From bb5de1f2ef66a4775c8d8bc4f632535d45b3f0b4 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 24 Apr 2017 10:48:06 +0900 Subject: [PATCH 1/6] Fill up documentations for functions in Column API in PySpark --- python/pyspark/sql/column.py | 101 ++++++++++++++---- .../expressions/bitwiseExpressions.scala | 2 +- .../scala/org/apache/spark/sql/Column.scala | 19 ++-- 3 files changed, 90 insertions(+), 32 deletions(-) diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 46c1707cb6c37..88a9d4097fd81 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -185,9 +185,44 @@ def __contains__(self, item): "in a string column or 'array_contains' function for an array column.") # bitwise operators - bitwiseOR = _bin_op("bitwiseOR") - bitwiseAND = _bin_op("bitwiseAND") - bitwiseXOR = _bin_op("bitwiseXOR") + _bitwiseOR_doc = """ + Compute bitwise OR of this expression with another expression. + + :param other: a value or :class:`Column` to calculate bitwise or(|) against + this :class:`Column`. + + >>> from pyspark.sql import Row + >>> df3 = spark.createDataFrame([Row(a=170, b=75)]) + >>> df3.select(df3.a.bitwiseOR(df3.b)).collect() + [Row((a | b)=235)] + """ + + _bitwiseAND_doc = """ + Compute bitwise AND of this expression with another expression. + + :param other: a value or :class:`Column` to calculate bitwise and(&) against + this :class:`Column`. + + >>> from pyspark.sql import Row + >>> df3 = spark.createDataFrame([Row(a=170, b=75)]) + >>> df3.select(df3.a.bitwiseAND(df3.b)).collect() + [Row((a & b)=10)] + """ + + _bitwiseXOR_doc = """ + Compute bitwise XOR of this expression with another expression. + + :param other: a value or :class:`Column` to calculate bitwise xor(^) against + this :class:`Column`. + + >>> from pyspark.sql import Row + >>> df3 = spark.createDataFrame([Row(a=170, b=75)]) + >>> df3.select(df3.a.bitwiseXOR(df3.b)).collect() + [Row((a ^ b)=225)] + """ + bitwiseOR = _bin_op("bitwiseOR", _bitwiseOR_doc) + bitwiseAND = _bin_op("bitwiseAND", _bitwiseAND_doc) + bitwiseXOR = _bin_op("bitwiseXOR", _bitwiseXOR_doc) @since(1.3) def getItem(self, key): @@ -195,7 +230,7 @@ def getItem(self, key): An expression that gets an item at position ``ordinal`` out of a list, or gets an item by key out of a dict. - >>> df = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"]) + >>> df = spark.createDataFrame([([1, 2], {"key": "value"})], ["l", "d"]) >>> df.select(df.l.getItem(0), df.d.getItem("key")).show() +----+------+ |l[0]|d[key]| @@ -217,7 +252,7 @@ def getField(self, name): An expression that gets a field by name in a StructField. >>> from pyspark.sql import Row - >>> df = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF() + >>> df = spark.createDataFrame([Row(r=Row(a=1, b="b"))]) >>> df.select(df.r.getField("b")).show() +---+ |r.b| @@ -251,7 +286,8 @@ def __iter__(self): # string methods _rlike_doc = """ - Return a Boolean :class:`Column` based on a regex match. + SQL RLIKE expression (LIKE with Regex). Returns a boolean :class:`Column` based on a regex + match. :param other: an extended regex expression @@ -259,7 +295,7 @@ def __iter__(self): [Row(age=2, name=u'Alice')] """ _like_doc = """ - Return a Boolean :class:`Column` based on a SQL LIKE match. + SQL like expression. Returns a boolean :class:`Column` based on a SQL LIKE match. :param other: a SQL LIKE pattern @@ -269,9 +305,9 @@ def __iter__(self): [Row(age=2, name=u'Alice')] """ _startswith_doc = """ - Return a Boolean :class:`Column` based on a string match. 
+ String starts with. Returns a boolean :class:`Column` based on a string match. - :param other: string at end of line (do not use a regex `^`) + :param other: string at start of line (do not use a regex `^`) >>> df.filter(df.name.startswith('Al')).collect() [Row(age=2, name=u'Alice')] @@ -279,7 +315,7 @@ def __iter__(self): [] """ _endswith_doc = """ - Return a Boolean :class:`Column` based on matching end of string. + String ends with. Returns a boolean :class:`Column` based on a string match. :param other: string at end of line (do not use a regex `$`) @@ -288,8 +324,16 @@ def __iter__(self): >>> df.filter(df.name.endswith('ice$')).collect() [] """ + _contains_doc = """ + Contains the other element. Returns a boolean :class:`Column` based on a string match. + + :param other: string in line + + >>> df.filter(df.name.contains('o')).collect() + [Row(age=5, name=u'Bob')] + """ - contains = _bin_op("contains") + contains = ignore_unicode_prefix(_bin_op("contains", _contains_doc)) rlike = ignore_unicode_prefix(_bin_op("rlike", _rlike_doc)) like = ignore_unicode_prefix(_bin_op("like", _like_doc)) startswith = ignore_unicode_prefix(_bin_op("startsWith", _startswith_doc)) @@ -337,26 +381,39 @@ def isin(self, *cols): return Column(jc) # order - asc = _unary_op("asc", "Returns a sort expression based on the" - " ascending order of the given column name.") - desc = _unary_op("desc", "Returns a sort expression based on the" - " descending order of the given column name.") + _asc_doc = """ + Returns an ascending ordering used in sorting. + + >>> from pyspark.sql import Row + >>> df2 = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) + >>> df2.select(df2.name).orderBy(df2.name.asc()).collect() + [Row(name=u'Alice'), Row(name=u'Tom')] + """ + _desc_doc = """ + Returns a descending ordering used in sorting. + + >>> from pyspark.sql import Row + >>> df2 = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) + >>> df2.select(df2.name).orderBy(df2.name.desc()).collect() + [Row(name=u'Tom'), Row(name=u'Alice')] + """ + + asc = ignore_unicode_prefix(_unary_op("asc", _asc_doc)) + desc = ignore_unicode_prefix(_unary_op("desc", _desc_doc)) _isNull_doc = """ - True if the current expression is null. Often combined with - :func:`DataFrame.filter` to select rows with null values. + True if the current expression is null. >>> from pyspark.sql import Row - >>> df2 = sc.parallelize([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]).toDF() + >>> df2 = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) >>> df2.filter(df2.height.isNull()).collect() [Row(height=None, name=u'Alice')] """ _isNotNull_doc = """ - True if the current expression is null. Often combined with - :func:`DataFrame.filter` to select rows with non-null values. + True if the current expression is NOT null. 
>>> from pyspark.sql import Row - >>> df2 = sc.parallelize([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]).toDF() + >>> df2 = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) >>> df2.filter(df2.height.isNotNull()).collect() [Row(height=80, name=u'Tom')] """ @@ -527,7 +584,7 @@ def _test(): .appName("sql.column tests")\ .getOrCreate() sc = spark.sparkContext - globs['sc'] = sc + globs['spark'] = spark globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \ .toDF(StructType([StructField('age', IntegerType()), StructField('name', StringType())])) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwiseExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwiseExpressions.scala index 2918040771433..425efbb6c96c4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwiseExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwiseExpressions.scala @@ -86,7 +86,7 @@ case class BitwiseOr(left: Expression, right: Expression) extends BinaryArithmet } /** - * A function that calculates bitwise xor of two numbers. + * A function that calculates bitwise xor({@literal ^}) of two numbers. * * Code generation inherited from BinaryArithmetic. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 43de2de7e7094..0d72ce631a086 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -779,7 +779,7 @@ class Column(val expr: Expression) extends Logging { def isin(list: Any*): Column = withExpr { In(expr, list.map(lit(_).expr)) } /** - * SQL like expression. + * SQL like expression. Returns a boolean column based on a SQL LIKE match. * * @group expr_ops * @since 1.3.0 @@ -787,7 +787,8 @@ class Column(val expr: Expression) extends Logging { def like(literal: String): Column = withExpr { Like(expr, lit(literal).expr) } /** - * SQL RLIKE expression (LIKE with Regex). + * SQL RLIKE expression (LIKE with Regex). Returns a boolean column based on a regex + * match. * * @group expr_ops * @since 1.3.0 @@ -838,7 +839,7 @@ class Column(val expr: Expression) extends Logging { } /** - * Contains the other element. + * Contains the other element. Returns a boolean column based on a string match. * * @group expr_ops * @since 1.3.0 @@ -846,7 +847,7 @@ class Column(val expr: Expression) extends Logging { def contains(other: Any): Column = withExpr { Contains(expr, lit(other).expr) } /** - * String starts with. + * String starts with. Returns a boolean column based on a string match. * * @group expr_ops * @since 1.3.0 @@ -854,7 +855,7 @@ class Column(val expr: Expression) extends Logging { def startsWith(other: Column): Column = withExpr { StartsWith(expr, lit(other).expr) } /** - * String starts with another string literal. + * String starts with another string literal. Returns a boolean column based on a string match. * * @group expr_ops * @since 1.3.0 @@ -862,7 +863,7 @@ class Column(val expr: Expression) extends Logging { def startsWith(literal: String): Column = this.startsWith(lit(literal)) /** - * String ends with. + * String ends with. Returns a boolean column based on a string match. 
* * @group expr_ops * @since 1.3.0 @@ -870,7 +871,7 @@ class Column(val expr: Expression) extends Logging { def endsWith(other: Column): Column = withExpr { EndsWith(expr, lit(other).expr) } /** - * String ends with another string literal. + * String ends with another string literal. Returns a boolean column based on a string match. * * @group expr_ops * @since 1.3.0 @@ -1008,7 +1009,7 @@ class Column(val expr: Expression) extends Logging { def cast(to: String): Column = cast(CatalystSqlParser.parseDataType(to)) /** - * Returns an ordering used in sorting. + * Returns a descending ordering used in sorting. * {{{ * // Scala * df.sort(df("age").desc) @@ -1083,7 +1084,7 @@ class Column(val expr: Expression) extends Logging { def asc_nulls_first: Column = withExpr { SortOrder(expr, Ascending, NullsFirst, Set.empty) } /** - * Returns an ordering used in sorting, where null values appear after non-null values. + * Returns an ascending ordering used in sorting, where null values appear after non-null values. * {{{ * // Scala: sort a DataFrame by age column in ascending order and null values appearing last. * df.sort(df("age").asc_nulls_last) From 92347de5dd544482e2afb9b69f078727224652bd Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 24 Apr 2017 12:51:58 +0900 Subject: [PATCH 2/6] Match asc/desc in functions.scala, Column.scala, functions.py and column.py --- python/pyspark/sql/column.py | 4 ++-- .../main/scala/org/apache/spark/sql/Column.scala | 16 ++++++++++------ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/python/pyspark/sql/column.py index 88a9d4097fd81..254361f3345de 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -382,7 +382,7 @@ def isin(self, *cols): # order _asc_doc = """ - Returns an ascending ordering used in sorting. + Returns a sort expression based on the descending order of the given column name >>> from pyspark.sql import Row >>> df2 = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) >>> df2.select(df2.name).orderBy(df2.name.asc()).collect() [Row(name=u'Alice'), Row(name=u'Tom')] """ _desc_doc = """ - Returns a descending ordering used in sorting. + Returns a sort expression based on the descending order of the given column name. >>> from pyspark.sql import Row >>> df2 = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) >>> df2.select(df2.name).orderBy(df2.name.desc()).collect() [Row(name=u'Tom'), Row(name=u'Alice')] """ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 0d72ce631a086..b23ab1fa3514a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -1009,7 +1009,7 @@ class Column(val expr: Expression) extends Logging { def cast(to: String): Column = cast(CatalystSqlParser.parseDataType(to)) /** - * Returns a descending ordering used in sorting. + * Returns a sort expression based on the descending order of the column. * {{{ * // Scala * df.sort(df("age").desc) @@ -1024,7 +1024,8 @@ class Column(val expr: Expression) extends Logging { def desc: Column = withExpr { SortOrder(expr, Descending) } /** - * Returns a descending ordering used in sorting, where null values appear before non-null values. + * Returns a sort expression based on the descending order of the column, + * and null values appear before non-null values. * {{{ * // Scala: sort a DataFrame by age column in descending order and null values appearing first.
* df.sort(df("age").desc_nulls_first) @@ -1039,7 +1040,8 @@ class Column(val expr: Expression) extends Logging { def desc_nulls_first: Column = withExpr { SortOrder(expr, Descending, NullsFirst, Set.empty) } /** - * Returns a descending ordering used in sorting, where null values appear after non-null values. + * Returns a sort expression based on the descending order of the column, + * and null values appear after non-null values. * {{{ * // Scala: sort a DataFrame by age column in descending order and null values appearing last. * df.sort(df("age").desc_nulls_last) @@ -1054,7 +1056,7 @@ class Column(val expr: Expression) extends Logging { def desc_nulls_last: Column = withExpr { SortOrder(expr, Descending, NullsLast, Set.empty) } /** - * Returns an ascending ordering used in sorting. + * Returns a sort expression based on ascending order of the column. * {{{ * // Scala: sort a DataFrame by age column in ascending order. * df.sort(df("age").asc) @@ -1069,7 +1071,8 @@ class Column(val expr: Expression) extends Logging { def asc: Column = withExpr { SortOrder(expr, Ascending) } /** - * Returns an ascending ordering used in sorting, where null values appear before non-null values. + * Returns a sort expression based on ascending order of the column, + * and null values return before non-null values. * {{{ * // Scala: sort a DataFrame by age column in ascending order and null values appearing first. * df.sort(df("age").asc_nulls_last) @@ -1084,7 +1087,8 @@ class Column(val expr: Expression) extends Logging { def asc_nulls_first: Column = withExpr { SortOrder(expr, Ascending, NullsFirst, Set.empty) } /** - * Returns an ascending ordering used in sorting, where null values appear after non-null values. + * Returns a sort expression based on ascending order of the column, + * and null values appear after non-null values. * {{{ * // Scala: sort a DataFrame by age column in ascending order and null values appearing last. * df.sort(df("age").asc_nulls_last) From af8ac74b624d54b16339083319e33e8af098655e Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 24 Apr 2017 12:52:59 +0900 Subject: [PATCH 3/6] Match functions.scala, Column.scala, functions.py and column.py --- python/pyspark/sql/column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 254361f3345de..02867aa57129e 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -382,7 +382,7 @@ def isin(self, *cols): # order _asc_doc = """ - Returns a sort expression based on the descending order of the given column name + Returns a sort expression based on the ascending order of the given column name >>> from pyspark.sql import Row >>> df2 = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) From 2815ff167b0ce9f6e0d2d6ae9f3d4fb0f3ce94d2 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 24 Apr 2017 13:16:56 +0900 Subject: [PATCH 4/6] Consistent newlines --- python/pyspark/sql/column.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 02867aa57129e..3d5cd7ed84b92 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -196,7 +196,6 @@ def __contains__(self, item): >>> df3.select(df3.a.bitwiseOR(df3.b)).collect() [Row((a | b)=235)] """ - _bitwiseAND_doc = """ Compute bitwise AND of this expression with another expression. 
@@ -208,7 +207,6 @@ def __contains__(self, item): >>> df3.select(df3.a.bitwiseAND(df3.b)).collect() [Row((a & b)=10)] """ - _bitwiseXOR_doc = """ Compute bitwise XOR of this expression with another expression. @@ -220,6 +218,7 @@ def __contains__(self, item): >>> df3.select(df3.a.bitwiseXOR(df3.b)).collect() [Row((a ^ b)=225)] """ + bitwiseOR = _bin_op("bitwiseOR", _bitwiseOR_doc) bitwiseAND = _bin_op("bitwiseAND", _bitwiseAND_doc) bitwiseXOR = _bin_op("bitwiseXOR", _bitwiseXOR_doc) From eaeb4564562272ae021fa1a7a8a083ccc56e5c33 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 25 Apr 2017 14:18:31 +0900 Subject: [PATCH 5/6] Move _contains_doc up --- python/pyspark/sql/column.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 3d5cd7ed84b92..58f4caf358fef 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -284,6 +284,14 @@ def __iter__(self): raise TypeError("Column is not iterable") # string methods + _contains_doc = """ + Contains the other element. Returns a boolean :class:`Column` based on a string match. + + :param other: string in line + + >>> df.filter(df.name.contains('o')).collect() + [Row(age=5, name=u'Bob')] + """ _rlike_doc = """ SQL RLIKE expression (LIKE with Regex). Returns a boolean :class:`Column` based on a regex match. @@ -323,14 +331,6 @@ def __iter__(self): >>> df.filter(df.name.endswith('ice$')).collect() [] """ - _contains_doc = """ - Contains the other element. Returns a boolean :class:`Column` based on a string match. - - :param other: string in line - - >>> df.filter(df.name.contains('o')).collect() - [Row(age=5, name=u'Bob')] - """ contains = ignore_unicode_prefix(_bin_op("contains", _contains_doc)) rlike = ignore_unicode_prefix(_bin_op("rlike", _rlike_doc)) From 0fd9e37051161bf7d54be2989163e101647f5d85 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 29 Apr 2017 18:54:12 +0900 Subject: [PATCH 6/6] df# to df --- python/pyspark/sql/column.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/python/pyspark/sql/column.py b/python/pyspark/sql/column.py index 58f4caf358fef..b8df37f25180f 100644 --- a/python/pyspark/sql/column.py +++ b/python/pyspark/sql/column.py @@ -192,8 +192,8 @@ def __contains__(self, item): this :class:`Column`. >>> from pyspark.sql import Row - >>> df3 = spark.createDataFrame([Row(a=170, b=75)]) - >>> df3.select(df3.a.bitwiseOR(df3.b)).collect() + >>> df = spark.createDataFrame([Row(a=170, b=75)]) + >>> df.select(df.a.bitwiseOR(df.b)).collect() [Row((a | b)=235)] """ _bitwiseAND_doc = """ @@ -203,8 +203,8 @@ def __contains__(self, item): this :class:`Column`. >>> from pyspark.sql import Row - >>> df3 = spark.createDataFrame([Row(a=170, b=75)]) - >>> df3.select(df3.a.bitwiseAND(df3.b)).collect() + >>> df = spark.createDataFrame([Row(a=170, b=75)]) + >>> df.select(df.a.bitwiseAND(df.b)).collect() [Row((a & b)=10)] """ _bitwiseXOR_doc = """ @@ -214,8 +214,8 @@ def __contains__(self, item): this :class:`Column`. 
>>> from pyspark.sql import Row - >>> df3 = spark.createDataFrame([Row(a=170, b=75)]) - >>> df3.select(df3.a.bitwiseXOR(df3.b)).collect() + >>> df = spark.createDataFrame([Row(a=170, b=75)]) + >>> df.select(df.a.bitwiseXOR(df.b)).collect() [Row((a ^ b)=225)] """ @@ -384,16 +384,16 @@ def isin(self, *cols): Returns a sort expression based on the ascending order of the given column name >>> from pyspark.sql import Row - >>> df2 = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) - >>> df2.select(df2.name).orderBy(df2.name.asc()).collect() + >>> df = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) + >>> df.select(df.name).orderBy(df.name.asc()).collect() [Row(name=u'Alice'), Row(name=u'Tom')] """ _desc_doc = """ Returns a sort expression based on the descending order of the given column name. >>> from pyspark.sql import Row - >>> df2 = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) - >>> df2.select(df2.name).orderBy(df2.name.desc()).collect() + >>> df = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) + >>> df.select(df.name).orderBy(df.name.desc()).collect() [Row(name=u'Tom'), Row(name=u'Alice')] """ @@ -404,16 +404,16 @@ def isin(self, *cols): True if the current expression is null. >>> from pyspark.sql import Row - >>> df2 = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) - >>> df2.filter(df2.height.isNull()).collect() + >>> df = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) + >>> df.filter(df.height.isNull()).collect() [Row(height=None, name=u'Alice')] """ _isNotNull_doc = """ True if the current expression is NOT null. >>> from pyspark.sql import Row - >>> df2 = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) - >>> df2.filter(df2.height.isNotNull()).collect() + >>> df = spark.createDataFrame([Row(name=u'Tom', height=80), Row(name=u'Alice', height=None)]) + >>> df.filter(df.height.isNotNull()).collect() [Row(height=80, name=u'Tom')] """
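The doctests added in this patch series only execute under the `_test()` harness wired up at the bottom of `python/pyspark/sql/column.py`. As a standalone sanity check of the documented behavior — a minimal sketch, not part of the patch, assuming a local PySpark installation with the 2.x-era API; the app name is illustrative — the new examples can be reproduced like this:

```python
# Minimal, standalone reproduction of the doctests added in this patch.
# Assumes a local PySpark (2.x-era API) install; the app name is illustrative.
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.appName("column-doc-sanity-check").getOrCreate()

# Bitwise operators: 170 = 0b10101010, 75 = 0b01001011.
df = spark.createDataFrame([Row(a=170, b=75)])
df.select(
    df.a.bitwiseOR(df.b),   # (a | b) = 235
    df.a.bitwiseAND(df.b),  # (a & b) = 10
    df.a.bitwiseXOR(df.b),  # (a ^ b) = 225
).show()

# Null checks and sort expressions.
people = spark.createDataFrame(
    [Row(name='Tom', height=80), Row(name='Alice', height=None)])
people.filter(people.height.isNull()).show()                   # Alice
people.filter(people.height.isNotNull()).show()                # Tom
people.select(people.name).orderBy(people.name.asc()).show()   # Alice, Tom
people.select(people.name).orderBy(people.name.desc()).show()  # Tom, Alice

# String matches: contains/startswith/endswith take plain literals,
# while rlike treats its argument as a regular expression.
names = spark.createDataFrame([(2, 'Alice'), (5, 'Bob')], ['age', 'name'])
names.filter(names.name.contains('o')).show()     # Bob
names.filter(names.name.startswith('Al')).show()  # Alice
names.filter(names.name.endswith('ice')).show()   # Alice
names.filter(names.name.rlike('ice$')).show()     # Alice

spark.stop()
```

Note that `startswith`/`endswith` match string literals while `rlike` interprets its argument as a regex — exactly the distinction the new `:param other:` lines in these docstrings call out.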