
Commit

document mergeSchema directly in API
nchammas committed Dec 2, 2019
1 parent 9cd174a commit da50864
Showing 2 changed files with 16 additions and 13 deletions.
14 changes: 8 additions & 6 deletions python/pyspark/sql/readwriter.py
@@ -300,18 +300,20 @@ def table(self, tableName):
         return self._df(self._jreader.table(tableName))
 
     @since(1.4)
-    def parquet(self, *paths):
-        """Loads Parquet files, returning the result as a :class:`DataFrame`.
+    def parquet(self, *paths, **options):
+        """
+        Loads Parquet files, returning the result as a :class:`DataFrame`.
 
-        You can set the following Parquet-specific option(s) for reading Parquet files:
-            * ``mergeSchema``: sets whether we should merge schemas collected from all \
-                Parquet part-files. This will override ``spark.sql.parquet.mergeSchema``. \
-                The default value is specified in ``spark.sql.parquet.mergeSchema``.
+        :param mergeSchema: sets whether we should merge schemas collected from all
+            Parquet part-files. This will override ``spark.sql.parquet.mergeSchema``.
+            The default value is specified in ``spark.sql.parquet.mergeSchema``.
 
         >>> df = spark.read.parquet('python/test_support/sql/parquet_partitioned')
         >>> df.dtypes
         [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
         """
+        mergeSchema = options.get('mergeSchema', None)
+        self._set_opts(mergeSchema=mergeSchema)
         return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths)))
 
     @ignore_unicode_prefix
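For reference, here is a usage sketch of the batch reader after this change. It is not part of the commit itself; it assumes a live SparkSession and a hypothetical Parquet directory 'data/events' whose part-files carry differing schemas:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("mergeSchemaDemo").getOrCreate()

    # mergeSchema can now be passed directly as a keyword argument instead of
    # going through spark.read.option('mergeSchema', 'true').parquet(...).
    # The path 'data/events' is hypothetical.
    df = spark.read.parquet('data/events', mergeSchema=True)
    df.printSchema()

Leaving the argument at its default of None defers to the spark.sql.parquet.mergeSchema session configuration, since _set_opts only forwards options whose value is not None.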
15 changes: 8 additions & 7 deletions python/pyspark/sql/streaming.py
@@ -526,22 +526,23 @@ def orc(self, path):
             raise TypeError("path can be only a single string")
 
     @since(2.0)
-    def parquet(self, path):
-        """Loads a Parquet file stream, returning the result as a :class:`DataFrame`.
+    def parquet(self, path, mergeSchema=None):
+        """
+        Loads a Parquet file stream, returning the result as a :class:`DataFrame`.
 
-        You can set the following Parquet-specific option(s) for reading Parquet files:
-            * ``mergeSchema``: sets whether we should merge schemas collected from all \
-                Parquet part-files. This will override ``spark.sql.parquet.mergeSchema``. \
-                The default value is specified in ``spark.sql.parquet.mergeSchema``.
-
         .. note:: Evolving.
 
+        :param mergeSchema: sets whether we should merge schemas collected from all
+            Parquet part-files. This will override ``spark.sql.parquet.mergeSchema``.
+            The default value is specified in ``spark.sql.parquet.mergeSchema``.
+
        >>> parquet_sdf = spark.readStream.schema(sdf_schema).parquet(tempfile.mkdtemp())
         >>> parquet_sdf.isStreaming
         True
         >>> parquet_sdf.schema == sdf_schema
         True
         """
+        self._set_opts(mergeSchema=mergeSchema)
         if isinstance(path, basestring):
             return self._df(self._jreader.parquet(path))
         else:
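A similar sketch for the streaming reader, again not from the commit; the schema and input directory below are hypothetical, and a streaming read requires an explicit schema up front:

    import tempfile

    from pyspark.sql import SparkSession
    from pyspark.sql.types import StringType, StructField, StructType

    spark = SparkSession.builder.appName("streamingMergeSchemaDemo").getOrCreate()
    schema = StructType([StructField("name", StringType(), True)])

    # mergeSchema is now a named parameter on the streaming reader as well,
    # mirroring the batch API above.
    sdf = spark.readStream.schema(schema).parquet(tempfile.mkdtemp(), mergeSchema=True)
    assert sdf.isStreaming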
