From 5c01f9491abd623d1243c79e91e33f627a75122c Mon Sep 17 00:00:00 2001 From: Xinrong Meng Date: Tue, 15 Feb 2022 19:18:16 +0800 Subject: [PATCH 1/7] for wrong format input --- .../sql/catalyst/expressions/stringExpressions.scala | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index f450dd80a8b13..fdba4d08f08ea 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -2565,7 +2565,7 @@ case class ToBinary(expr: Expression, format: Option[Expression], child: Express case lit if lit.foldable => val value = lit.eval() if (value == null) Literal(null, BinaryType) - else { + else if (value.isInstanceOf[UTF8String]) { value.asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT) match { case "hex" => Unhex(expr) case "utf-8" => Encode(expr, Literal("UTF-8")) @@ -2573,7 +2573,7 @@ case class ToBinary(expr: Expression, format: Option[Expression], child: Express case "base2" => Cast(expr, BinaryType) case _ => lit } - } + } else lit case other => other } @@ -2591,8 +2591,9 @@ case class ToBinary(expr: Expression, format: Option[Expression], child: Express def checkFormat(lit: Expression) = { if (lit.foldable) { val value = lit.eval() - value == null || Seq("hex", "utf-8", "base64", "base2").contains( - value.asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT)) + value == null || (value.isInstanceOf[UTF8String] && + Seq("hex", "utf-8", "base64", "base2").contains( + value.asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT))) } else false } From 2869cb10cf9c82cb397031f63f19f01405a5c1e1 Mon Sep 17 00:00:00 2001 From: Xinrong Meng Date: Tue, 15 Feb 2022 20:51:22 +0800 Subject: [PATCH 2/7] test --- .../resources/sql-tests/inputs/string-functions.sql | 1 + .../sql-tests/results/ansi/string-functions.sql.out | 11 ++++++++++- .../sql-tests/results/string-functions.sql.out | 11 ++++++++++- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 9571f3eb6c2bb..b3c6001c2e613 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -151,3 +151,4 @@ select to_binary(null, 'utf-8'); select to_binary(null, null); select to_binary(null, cast(null as string)); select to_binary('abc', 'invalidFormat'); +select to_binary('abc', 1); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 86c90fc1fe34d..568688ea86249 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 116 +-- Number of queries: 117 -- !query @@ -937,3 +937,12 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException cannot resolve 'to_binary('abc', 'invalidFormat')' due to data type mismatch: Unsupported encoding format: Some(invalidFormat). The format has to be a case-insensitive string literal of 'hex', 'utf-8', 'base2', or 'base64'; line 1 pos 7 + + +-- !query +select to_binary('abc', 1) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'to_binary('abc', 1)' due to data type mismatch: Unsupported encoding format: Some(1). The format has to be a case-insensitive string literal of 'hex', 'utf-8', 'base2', or 'base64'; line 1 pos 7 diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index f3852a9527b00..940cbb4bb4b30 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 116 +-- Number of queries: 117 -- !query @@ -933,3 +933,12 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException cannot resolve 'to_binary('abc', 'invalidFormat')' due to data type mismatch: Unsupported encoding format: Some(invalidFormat). The format has to be a case-insensitive string literal of 'hex', 'utf-8', 'base2', or 'base64'; line 1 pos 7 + + +-- !query +select to_binary('abc', 1) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'to_binary('abc', 1)' due to data type mismatch: Unsupported encoding format: Some(1). The format has to be a case-insensitive string literal of 'hex', 'utf-8', 'base2', or 'base64'; line 1 pos 7 From 98634d7bb15ef501f0003afa6f20f91235ae34b2 Mon Sep 17 00:00:00 2001 From: Xinrong Meng Date: Wed, 16 Feb 2022 11:23:33 +0800 Subject: [PATCH 3/7] trigger test From 8be5db9f1e191100b8429c35f3e612057d646402 Mon Sep 17 00:00:00 2001 From: Xinrong Meng Date: Wed, 16 Feb 2022 16:47:00 +0800 Subject: [PATCH 4/7] rmv base2 format support --- .../catalyst/expressions/stringExpressions.scala | 7 +++---- .../sql-tests/inputs/string-functions.sql | 1 - .../results/ansi/string-functions.sql.out | 14 +++----------- .../sql-tests/results/string-functions.sql.out | 14 +++----------- 4 files changed, 9 insertions(+), 27 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index fdba4d08f08ea..f7b2dba51aa3c 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -2545,7 +2545,7 @@ case class Encode(value: Expression, charset: Expression) @ExpressionDescription( usage = """ _FUNC_(str[, fmt]) - Converts the input `str` to a binary value based on the supplied `fmt`. - `fmt` can be a case-insensitive string literal of "hex", "utf-8", "base2", or "base64". + `fmt` can be a case-insensitive string literal of "hex", "utf-8", or "base64". By default, the binary format for conversion is "hex" if `fmt` is omitted. The function returns NULL if at least one of the input parameters is NULL. """, @@ -2570,7 +2570,6 @@ case class ToBinary(expr: Expression, format: Option[Expression], child: Express case "hex" => Unhex(expr) case "utf-8" => Encode(expr, Literal("UTF-8")) case "base64" => UnBase64(expr) - case "base2" => Cast(expr, BinaryType) case _ => lit } } else lit @@ -2592,7 +2591,7 @@ case class ToBinary(expr: Expression, format: Option[Expression], child: Express if (lit.foldable) { val value = lit.eval() value == null || (value.isInstanceOf[UTF8String] && - Seq("hex", "utf-8", "base64", "base2").contains( + Seq("hex", "utf-8", "base64").contains( value.asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT))) } else false } @@ -2602,7 +2601,7 @@ case class ToBinary(expr: Expression, format: Option[Expression], child: Express } else { TypeCheckResult.TypeCheckFailure( s"Unsupported encoding format: $format. The format has to be " + - s"a case-insensitive string literal of 'hex', 'utf-8', 'base2', or 'base64'") + s"a case-insensitive string literal of 'hex', 'utf-8', or 'base64'") } } diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index b3c6001c2e613..9f929c64d5cf6 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -140,7 +140,6 @@ select to_number('00,454.8-', '00,000.9-'); select to_binary('abc'); select to_binary('abc', 'utf-8'); select to_binary('abc', 'base64'); -select to_binary('abc', 'base2'); select to_binary('abc', 'hex'); select to_binary('abc', concat('utf', '-8')); select to_binary('abc', concat('base', '64')); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 568688ea86249..6647225e5fecb 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 117 +-- Number of queries: 116 -- !query @@ -850,14 +850,6 @@ struct i� --- !query -select to_binary('abc', 'base2') --- !query schema -struct --- !query output -abc - - -- !query select to_binary('abc', 'hex') -- !query schema @@ -936,7 +928,7 @@ select to_binary('abc', 'invalidFormat') struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve 'to_binary('abc', 'invalidFormat')' due to data type mismatch: Unsupported encoding format: Some(invalidFormat). The format has to be a case-insensitive string literal of 'hex', 'utf-8', 'base2', or 'base64'; line 1 pos 7 +cannot resolve 'to_binary('abc', 'invalidFormat')' due to data type mismatch: Unsupported encoding format: Some(invalidFormat). The format has to be a case-insensitive string literal of 'hex', 'utf-8', or 'base64'; line 1 pos 7 -- !query @@ -945,4 +937,4 @@ select to_binary('abc', 1) struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve 'to_binary('abc', 1)' due to data type mismatch: Unsupported encoding format: Some(1). The format has to be a case-insensitive string literal of 'hex', 'utf-8', 'base2', or 'base64'; line 1 pos 7 +cannot resolve 'to_binary('abc', 1)' due to data type mismatch: Unsupported encoding format: Some(1). The format has to be a case-insensitive string literal of 'hex', 'utf-8', or 'base64'; line 1 pos 7 diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 940cbb4bb4b30..cc854f1719dcb 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 117 +-- Number of queries: 116 -- !query @@ -846,14 +846,6 @@ struct i� --- !query -select to_binary('abc', 'base2') --- !query schema -struct --- !query output -abc - - -- !query select to_binary('abc', 'hex') -- !query schema @@ -932,7 +924,7 @@ select to_binary('abc', 'invalidFormat') struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve 'to_binary('abc', 'invalidFormat')' due to data type mismatch: Unsupported encoding format: Some(invalidFormat). The format has to be a case-insensitive string literal of 'hex', 'utf-8', 'base2', or 'base64'; line 1 pos 7 +cannot resolve 'to_binary('abc', 'invalidFormat')' due to data type mismatch: Unsupported encoding format: Some(invalidFormat). The format has to be a case-insensitive string literal of 'hex', 'utf-8', or 'base64'; line 1 pos 7 -- !query @@ -941,4 +933,4 @@ select to_binary('abc', 1) struct<> -- !query output org.apache.spark.sql.AnalysisException -cannot resolve 'to_binary('abc', 1)' due to data type mismatch: Unsupported encoding format: Some(1). The format has to be a case-insensitive string literal of 'hex', 'utf-8', 'base2', or 'base64'; line 1 pos 7 +cannot resolve 'to_binary('abc', 1)' due to data type mismatch: Unsupported encoding format: Some(1). The format has to be a case-insensitive string literal of 'hex', 'utf-8', or 'base64'; line 1 pos 7 From 4b9372695a2cf5418452d5ab623b6cab733ab778 Mon Sep 17 00:00:00 2001 From: Xinrong Meng Date: Thu, 17 Feb 2022 16:25:08 +0800 Subject: [PATCH 5/7] fail when format is cast(null as int) --- .../sql/catalyst/expressions/stringExpressions.scala | 4 ++-- .../resources/sql-tests/inputs/string-functions.sql | 1 + .../sql-tests/results/ansi/string-functions.sql.out | 11 ++++++++++- .../sql-tests/results/string-functions.sql.out | 11 ++++++++++- 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index f7b2dba51aa3c..92a2d1766d837 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -2562,7 +2562,7 @@ case class ToBinary(expr: Expression, format: Option[Expression], child: Express def this(expr: Expression, format: Expression) = this(expr, Option(format), format match { - case lit if lit.foldable => + case lit if (lit.foldable && Seq(StringType, NullType).contains(lit.dataType)) => val value = lit.eval() if (value == null) Literal(null, BinaryType) else if (value.isInstanceOf[UTF8String]) { @@ -2588,7 +2588,7 @@ case class ToBinary(expr: Expression, format: Option[Expression], child: Express override def checkInputDataTypes(): TypeCheckResult = { def checkFormat(lit: Expression) = { - if (lit.foldable) { + if (lit.foldable && Seq(StringType, NullType).contains(lit.dataType)) { val value = lit.eval() value == null || (value.isInstanceOf[UTF8String] && Seq("hex", "utf-8", "base64").contains( diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 9f929c64d5cf6..94eb96f6249a0 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -149,5 +149,6 @@ select to_binary('abc', null); select to_binary(null, 'utf-8'); select to_binary(null, null); select to_binary(null, cast(null as string)); +select to_binary(null, cast(null as int)); select to_binary('abc', 'invalidFormat'); select to_binary('abc', 1); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 6647225e5fecb..4c0aa8c948334 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 116 +-- Number of queries: 117 -- !query @@ -922,6 +922,15 @@ struct NULL +-- !query +select to_binary(null, cast(null as int)) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'to_binary(NULL, CAST(NULL AS INT))' due to data type mismatch: Unsupported encoding format: Some(ansi_cast(null as int)). The format has to be a case-insensitive string literal of 'hex', 'utf-8', or 'base64'; line 1 pos 7 + + -- !query select to_binary('abc', 'invalidFormat') -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index cc854f1719dcb..bb2974db2322b 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 116 +-- Number of queries: 117 -- !query @@ -918,6 +918,15 @@ struct NULL +-- !query +select to_binary(null, cast(null as int)) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'to_binary(NULL, CAST(NULL AS INT))' due to data type mismatch: Unsupported encoding format: Some(cast(null as int)). The format has to be a case-insensitive string literal of 'hex', 'utf-8', or 'base64'; line 1 pos 7 + + -- !query select to_binary('abc', 'invalidFormat') -- !query schema From 6d94e0003e71ab7d1284eefa191f981a821a62b3 Mon Sep 17 00:00:00 2001 From: Xinrong Meng Date: Thu, 17 Feb 2022 18:47:26 +0800 Subject: [PATCH 6/7] rmv unnecessary chk --- .../spark/sql/catalyst/expressions/stringExpressions.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 92a2d1766d837..6756c9f09881e 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -2565,14 +2565,14 @@ case class ToBinary(expr: Expression, format: Option[Expression], child: Express case lit if (lit.foldable && Seq(StringType, NullType).contains(lit.dataType)) => val value = lit.eval() if (value == null) Literal(null, BinaryType) - else if (value.isInstanceOf[UTF8String]) { + else { value.asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT) match { case "hex" => Unhex(expr) case "utf-8" => Encode(expr, Literal("UTF-8")) case "base64" => UnBase64(expr) case _ => lit } - } else lit + } case other => other } @@ -2590,7 +2590,7 @@ case class ToBinary(expr: Expression, format: Option[Expression], child: Express def checkFormat(lit: Expression) = { if (lit.foldable && Seq(StringType, NullType).contains(lit.dataType)) { val value = lit.eval() - value == null || (value.isInstanceOf[UTF8String] && + value == null || Seq("hex", "utf-8", "base64").contains( value.asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT))) } else false From 75494c3a0a8c7090d8f77496c96d2c7a8d05fc1b Mon Sep 17 00:00:00 2001 From: Xinrong Meng Date: Fri, 18 Feb 2022 09:34:23 +0800 Subject: [PATCH 7/7] fix --- .../spark/sql/catalyst/expressions/stringExpressions.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 6756c9f09881e..56cd224dd8c53 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -2592,7 +2592,7 @@ case class ToBinary(expr: Expression, format: Option[Expression], child: Express val value = lit.eval() value == null || Seq("hex", "utf-8", "base64").contains( - value.asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT))) + value.asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT)) } else false }