From 41384d45c50e31133d64690605df85f0d663916c Mon Sep 17 00:00:00 2001 From: stczwd Date: Wed, 23 Oct 2019 20:04:16 +0800 Subject: [PATCH 1/2] [SPARK-29444][FOLLOWUP] add doc for ignoreNullFields in json generating --- python/pyspark/sql/readwriter.py | 7 +++++-- .../org/apache/spark/sql/catalyst/json/JSONOptions.scala | 4 ++-- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 9 +++++---- .../scala/org/apache/spark/sql/DataFrameWriter.scala | 2 ++ 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index f92face2d0573..7c3580ddb0940 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -788,7 +788,7 @@ def saveAsTable(self, name, format=None, mode=None, partitionBy=None, **options) @since(1.4) def json(self, path, mode=None, compression=None, dateFormat=None, timestampFormat=None, - lineSep=None, encoding=None): + lineSep=None, encoding=None, ignoreNullFields=None): """Saves the content of the :class:`DataFrame` in JSON format (`JSON Lines text format or newline-delimited JSON `_) at the specified path. @@ -817,13 +817,16 @@ def json(self, path, mode=None, compression=None, dateFormat=None, timestampForm the default UTF-8 charset will be used. :param lineSep: defines the line separator that should be used for writing. If None is set, it uses the default value, ``\\n``. + :param ignoreNullFields: whether to ignore null fields in column/struct + during json generating. If None is set, + it uses the default value, ``true``. 
>>> df.write.json(os.path.join(tempfile.mkdtemp(), 'data')) """ self.mode(mode) self._set_opts( compression=compression, dateFormat=dateFormat, timestampFormat=timestampFormat, - lineSep=lineSep, encoding=encoding) + lineSep=lineSep, encoding=encoding, ignoreNullFields=ignoreNullFields) self._jwrite.json(path) @since(1.4) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala index e7bfb77e46c26..4952540f1132d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala @@ -78,8 +78,8 @@ private[sql] class JSONOptions( val dropFieldIfAllNull = parameters.get("dropFieldIfAllNull").map(_.toBoolean).getOrElse(false) // Whether to ignore null fields during json generating - val ignoreNullFields = parameters.getOrElse("ignoreNullFields", - SQLConf.get.jsonGeneratorIgnoreNullFields).toBoolean + val ignoreNullFields = parameters.get("ignoreNullFields").map(_.toBoolean) + .getOrElse(SQLConf.get.jsonGeneratorIgnoreNullFields) // A language tag in IETF BCP 47 format val locale: Locale = parameters.get("locale").map(Locale.forLanguageTag).getOrElse(Locale.US) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 75db52e334b86..fe86b1e41824c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1189,9 +1189,10 @@ object SQLConf { val JSON_GENERATOR_IGNORE_NULL_FIELDS = buildConf("spark.sql.jsonGenerator.ignoreNullFields") - .doc("If false, JacksonGenerator will generate null for null fields in Struct.") - .stringConf - .createWithDefault("true") + .doc("Whether to ignore null fields in column/struct during json 
generating. " + + "If false, json generator will generate null in Column/Struct.") + .booleanConf + .createWithDefault(true) val FILE_SINK_LOG_DELETION = buildConf("spark.sql.streaming.fileSink.log.deletion") .internal() @@ -2385,7 +2386,7 @@ class SQLConf extends Serializable with Logging { def sessionLocalTimeZone: String = getConf(SQLConf.SESSION_LOCAL_TIMEZONE) - def jsonGeneratorIgnoreNullFields: String = getConf(SQLConf.JSON_GENERATOR_IGNORE_NULL_FIELDS) + def jsonGeneratorIgnoreNullFields: Boolean = getConf(SQLConf.JSON_GENERATOR_IGNORE_NULL_FIELDS) def parallelFileListingInStatsComputation: Boolean = getConf(SQLConf.PARALLEL_FILE_LISTING_IN_STATS_COMPUTATION) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 4f88cc6daa331..bd13bc1467bfc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -687,6 +687,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { *
<li>`encoding` (by default it is not set): specifies encoding (charset) of saved json
    * files. If it is not set, the UTF-8 charset will be used.</li>
    * <li>`lineSep` (default `\n`): defines the line separator that should be used for writing.</li>
+   * <li>`ignoreNullFields` (default `true`): whether to ignore null fields in column/struct
+   * during json generating.</li>
  • * * * @since 1.4.0 From 40bb5151015191d975a141daaedf4bc1afcd0486 Mon Sep 17 00:00:00 2001 From: stczwd Date: Thu, 24 Oct 2019 16:39:27 +0800 Subject: [PATCH 2/2] [SPARK-29444] change ignoreNullFields doc --- python/pyspark/sql/readwriter.py | 5 ++--- .../main/scala/org/apache/spark/sql/internal/SQLConf.scala | 5 +++-- .../main/scala/org/apache/spark/sql/DataFrameWriter.scala | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 7c3580ddb0940..18fd7de7ee547 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -817,9 +817,8 @@ def json(self, path, mode=None, compression=None, dateFormat=None, timestampForm the default UTF-8 charset will be used. :param lineSep: defines the line separator that should be used for writing. If None is set, it uses the default value, ``\\n``. - :param ignoreNullFields: whether to ignore null fields in column/struct - during json generating. If None is set, - it uses the default value, ``true``. + :param ignoreNullFields: Whether to ignore null fields when generating JSON objects. + If None is set, it uses the default value, ``true``. >>> df.write.json(os.path.join(tempfile.mkdtemp(), 'data')) """ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index fe86b1e41824c..4e1c71c7bfae2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1189,8 +1189,9 @@ object SQLConf { val JSON_GENERATOR_IGNORE_NULL_FIELDS = buildConf("spark.sql.jsonGenerator.ignoreNullFields") - .doc("Whether to ignore null fields in column/struct during json generating. 
" + - "If false, json generator will generate null in Column/Struct.") + .doc("Whether to ignore null fields when generating JSON objects in JSON data source and " + + "JSON functions such as to_json. " + + "If false, it generates null for null fields in JSON objects.") .booleanConf .createWithDefault(true) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index bd13bc1467bfc..68127c27a8cc2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -687,8 +687,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { *
<li>`encoding` (by default it is not set): specifies encoding (charset) of saved json
    * files. If it is not set, the UTF-8 charset will be used.</li>
    * <li>`lineSep` (default `\n`): defines the line separator that should be used for writing.</li>
-   * <li>`ignoreNullFields` (default `true`): whether to ignore null fields in column/struct
-   * during json generating.</li>
+   * <li>`ignoreNullFields` (default `true`): Whether to ignore null fields
+   * when generating JSON objects.</li>
    * </ul>
    *
    * @since 1.4.0