diff --git a/docs/pyspark-migration-guide.md b/docs/pyspark-migration-guide.md
index 889941c37bf43..1b8d1fc1c5776 100644
--- a/docs/pyspark-migration-guide.md
+++ b/docs/pyspark-migration-guide.md
@@ -84,6 +84,9 @@ Please refer [Migration Guide: SQL, Datasets and DataFrame](sql-migration-guide.
 
 - Since Spark 3.0, `createDataFrame(..., verifySchema=True)` validates `LongType` as well in PySpark. Previously, `LongType` was not verified and resulted in `None` in case the value overflows. To restore this behavior, `verifySchema` can be set to `False` to disable the validation.
 
+- Since Spark 3.0, `Column.getItem` is fixed such that it does not call `Column.apply`. Consequently, if `Column` is used as an argument to `getItem`, the indexing operator should be used.
+  For example, `map_col.getItem(col('id'))` should be replaced with `map_col[col('id')]`.
+
 ## Upgrading from PySpark 2.3 to 2.4
 
 - In PySpark, when Arrow optimization is enabled, previously `toPandas` just failed when Arrow optimization is unable to be used whereas `createDataFrame` from Pandas DataFrame allowed the fallback to non-optimization. Now, both `toPandas` and `createDataFrame` from Pandas DataFrame allow the fallback by default, which can be switched off by `spark.sql.execution.arrow.fallback.enabled`.
diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py
index 7f12d2324e715..b472a4221cd0c 100644
--- a/python/pyspark/sql/column.py
+++ b/python/pyspark/sql/column.py
@@ -296,14 +296,12 @@ def getItem(self, key):
         +----+------+
         |   1| value|
         +----+------+
-        >>> df.select(df.l[0], df.d["key"]).show()
-        +----+------+
-        |l[0]|d[key]|
-        +----+------+
-        |   1| value|
-        +----+------+
+
+        .. versionchanged:: 3.0
+           If `key` is a `Column` object, the indexing operator should be used instead.
+           For example, `map_col.getItem(col('id'))` should be replaced with `map_col[col('id')]`.
         """
-        return self[key]
+        return _bin_op("getItem")(self, key)
 
     @since(1.3)
     def getField(self, name):
diff --git a/python/pyspark/sql/tests/test_column.py b/python/pyspark/sql/tests/test_column.py
index f7f2164dcd350..d9d933110dab5 100644
--- a/python/pyspark/sql/tests/test_column.py
+++ b/python/pyspark/sql/tests/test_column.py
@@ -18,6 +18,8 @@
 
 import sys
 
+from py4j.protocol import Py4JJavaError
+
 from pyspark.sql import Column, Row
 from pyspark.sql.types import *
 from pyspark.sql.utils import AnalysisException
@@ -85,7 +87,7 @@ def test_column_operators(self):
             "Cannot apply 'in' operator against a column",
             lambda: 1 in cs)
 
-    def test_column_getitem(self):
+    def test_column_apply(self):
         from pyspark.sql.functions import col
 
         self.assertIsInstance(col("foo")[1:3], Column)
@@ -93,6 +95,16 @@ def test_column_getitem(self):
         self.assertIsInstance(col("foo")["bar"], Column)
         self.assertRaises(ValueError, lambda: col("foo")[0:10:2])
 
+    def test_column_getitem(self):
+        from pyspark.sql.functions import col, create_map, lit
+
+        map_col = create_map(lit(0), lit(100), lit(1), lit(200))
+        self.assertRaisesRegexp(
+            Py4JJavaError,
+            "Unsupported literal type class org.apache.spark.sql.Column id",
+            lambda: map_col.getItem(col('id'))
+        )
+
     def test_column_select(self):
         df = self.df
         self.assertEqual(self.testData, df.select("*").collect())
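
To illustrate the migration note above, here is a minimal sketch (not part of the patch) of what the behavior change means for user code. It assumes an active `SparkSession`; the DataFrame, column, and variable names are made up for the example.

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, create_map, lit

spark = SparkSession.builder.getOrCreate()

# A small DataFrame with an integer `id` column holding 0 and 1.
df = spark.range(2)

# A map literal {0 -> 'zero', 1 -> 'one'} built from constant key/value pairs.
map_col = create_map(lit(0), lit("zero"), lit(1), lit("one"))

# Looking up the map with a Column key: since Spark 3.0 this must go through
# the indexing operator rather than getItem.
df.select(map_col[col("id")].alias("name")).show()

# Before Spark 3.0 the following also worked, because getItem fell through to
# Column.apply; with this patch it raises an error for a Column argument:
# df.select(map_col.getItem(col("id"))).show()
```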