[SPARK-20290][MINOR][PYTHON][SQL] Add PySpark wrapper for eqNullSafe
## What changes were proposed in this pull request?

Adds Python bindings for `Column.eqNullSafe`
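
A minimal end-to-end sketch of the new binding (the local session setup below is illustrative, not part of the patch; the behaviour shown matches the doctests added in `column.py`):

```python
from pyspark.sql import Row, SparkSession

# Illustrative local session; any SparkSession running a build with this patch works.
spark = SparkSession.builder.master("local[1]").appName("eqNullSafe-demo").getOrCreate()

df = spark.createDataFrame([Row(id=1, value='foo'), Row(id=2, value=None)])

# Plain `==` yields NULL when either operand is NULL, while `eqNullSafe`
# (SQL's `<=>` operator) always yields true or false.
df.select(
    df['value'] == 'foo',           # null for the row where value is NULL
    df['value'].eqNullSafe('foo'),  # false for that row
    df['value'].eqNullSafe(None),   # true for that row
).show()

spark.stop()
```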

## How was this patch tested?

Manual tests, existing unit tests, doc build.

Author: zero323 <[email protected]>

Closes #17605 from zero323/SPARK-20290.
zero323 authored and gatorsmile committed May 1, 2017
1 parent a355b66 commit f0169a1
Showing 2 changed files with 56 additions and 1 deletion.
55 changes: 55 additions & 0 deletions python/pyspark/sql/column.py
@@ -171,6 +171,61 @@ def __init__(self, jc):
__ge__ = _bin_op("geq")
__gt__ = _bin_op("gt")

_eqNullSafe_doc = """
Equality test that is safe for null values.

:param other: a value or :class:`Column`

>>> from pyspark.sql import Row
>>> df1 = spark.createDataFrame([
...     Row(id=1, value='foo'),
...     Row(id=2, value=None)
... ])
>>> df1.select(
...     df1['value'] == 'foo',
...     df1['value'].eqNullSafe('foo'),
...     df1['value'].eqNullSafe(None)
... ).show()
+-------------+---------------+----------------+
|(value = foo)|(value <=> foo)|(value <=> NULL)|
+-------------+---------------+----------------+
|         true|           true|           false|
|         null|          false|            true|
+-------------+---------------+----------------+
>>> df2 = spark.createDataFrame([
...     Row(value='bar'),
...     Row(value=None)
... ])
>>> df1.join(df2, df1["value"] == df2["value"]).count()
0
>>> df1.join(df2, df1["value"].eqNullSafe(df2["value"])).count()
1
>>> df2 = spark.createDataFrame([
...     Row(id=1, value=float('NaN')),
...     Row(id=2, value=42.0),
...     Row(id=3, value=None)
... ])
>>> df2.select(
...     df2['value'].eqNullSafe(None),
...     df2['value'].eqNullSafe(float('NaN')),
...     df2['value'].eqNullSafe(42.0)
... ).show()
+----------------+---------------+----------------+
|(value <=> NULL)|(value <=> NaN)|(value <=> 42.0)|
+----------------+---------------+----------------+
|           false|           true|           false|
|           false|          false|            true|
|            true|          false|           false|
+----------------+---------------+----------------+

.. note:: Unlike Pandas, PySpark doesn't consider NaN values to be NULL.
    See the `NaN Semantics`_ for details.

.. _NaN Semantics:
    https://spark.apache.org/docs/latest/sql-programming-guide.html#nan-semantics

.. versionadded:: 2.3.0
"""
eqNullSafe = _bin_op("eqNullSafe", _eqNullSafe_doc)

# `and`, `or`, `not` cannot be overloaded in Python,
# so use bitwise operators as boolean operators
__and__ = _bin_op('and')
2 changes: 1 addition & 1 deletion python/pyspark/sql/tests.py
@@ -982,7 +982,7 @@ def test_column_operators(self):
cbool = (ci & ci), (ci | ci), (~ci)
self.assertTrue(all(isinstance(c, Column) for c in cbool))
css = cs.contains('a'), cs.like('a'), cs.rlike('a'), cs.asc(), cs.desc(),\
-    cs.startswith('a'), cs.endswith('a')
+    cs.startswith('a'), cs.endswith('a'), ci.eqNullSafe(cs)
self.assertTrue(all(isinstance(c, Column) for c in css))
self.assertTrue(isinstance(ci.cast(LongType()), Column))
self.assertRaisesRegexp(ValueError,
Expand Down
