From 25dd4254fed71923731fd59838875c0dd1ff665a Mon Sep 17 00:00:00 2001
From: Xinrong Meng
Date: Sun, 13 Feb 2022 09:32:32 +0900
Subject: [PATCH] [SPARK-37507][SQL] Add a new SQL function to_binary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What changes were proposed in this pull request?
Introduce a SQL function `to_binary`, which converts the input string to a binary value based on the supplied format (which specifies how to interpret the string).

Syntax:
```
to_binary(str_column[, fmt])
```
where
- `fmt` can be a case-insensitive string literal of "hex", "utf-8", "base2", or "base64".
- By default, the binary format for conversion is "hex" if `fmt` is omitted.

### Why are the changes needed?
`to_binary` is a common function available in many DBMSes, for example:
- [TO_VARBYTE function - Amazon Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_TO_VARBYTE.html)
- [TO_BINARY — Snowflake Documentation](https://docs.snowflake.com/en/sql-reference/functions/to_binary.html)
- [Expressions, functions, and operators | BigQuery | Google Cloud](https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-and-operators#format_string_as_bytes)
- [Teradata Online Documentation | Quick access to technical manuals](https://docs.teradata.com/r/kmuOwjp1zEYg98JsB8fu_A/etRo5aTAY9n5fUPjxSEynw)

Introducing it improves compatibility and eases migration.

In addition, `to_binary` can unify existing Spark functions: `encode`, `unhex`, `unbase64`, and `binary`, which makes the API easier to remember and use.

### Does this PR introduce _any_ user-facing change?
Yes, a new function for string-to-binary conversion with a specified format.

### How was this patch tested?
Unit tests.

Closes #35415 from xinrong-databricks/to_binary.
Authored-by: Xinrong Meng Signed-off-by: Hyukjin Kwon --- .../catalyst/analysis/FunctionRegistry.scala | 1 + .../expressions/stringExpressions.scala | 71 +++++++++++ .../sql-functions/sql-expression-schema.md | 3 +- .../sql-tests/inputs/string-functions.sql | 16 +++ .../results/ansi/string-functions.sql.out | 115 +++++++++++++++++- .../results/string-functions.sql.out | 115 +++++++++++++++++- 6 files changed, 318 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index ce7cee5764ce4..7a3809378cfd1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -562,6 +562,7 @@ object FunctionRegistry { expression[Second]("second"), expression[ParseToTimestamp]("to_timestamp"), expression[ParseToDate]("to_date"), + expression[ToBinary]("to_binary"), expression[ToUnixTimestamp]("to_unix_timestamp"), expression[ToUTCTimestamp]("to_utc_timestamp"), expression[ParseToTimestampNTZ]("to_timestamp_ntz"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 2086c6dfe4bdc..f450dd80a8b13 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -2538,6 +2538,77 @@ case class Encode(value: Expression, charset: Expression) newLeft: Expression, newRight: Expression): Encode = copy(value = newLeft, charset = newRight) } +/** + * Converts the input expression to a binary value based on the supplied format. 
+ */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(str[, fmt]) - Converts the input `str` to a binary value based on the supplied `fmt`. + `fmt` can be a case-insensitive string literal of "hex", "utf-8", "base2", or "base64". + By default, the binary format for conversion is "hex" if `fmt` is omitted. + The function returns NULL if at least one of the input parameters is NULL. + """, + examples = """ + Examples: + > SELECT _FUNC_('abc', 'utf-8'); + abc + """, + since = "3.3.0", + group = "string_funcs") +// scalastyle:on line.size.limit +case class ToBinary(expr: Expression, format: Option[Expression], child: Expression) + extends RuntimeReplaceable { + + def this(expr: Expression, format: Expression) = this(expr, Option(format), + format match { + case lit if lit.foldable => + val value = lit.eval() + if (value == null) Literal(null, BinaryType) + else { + value.asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT) match { + case "hex" => Unhex(expr) + case "utf-8" => Encode(expr, Literal("UTF-8")) + case "base64" => UnBase64(expr) + case "base2" => Cast(expr, BinaryType) + case _ => lit + } + } + + case other => other + } + ) + + def this(expr: Expression) = this(expr, None, Unhex(expr)) + + override def flatArguments: Iterator[Any] = Iterator(expr, format) + override def exprsReplaced: Seq[Expression] = expr +: format.toSeq + + override def prettyName: String = "to_binary" + override def dataType: DataType = BinaryType + + override def checkInputDataTypes(): TypeCheckResult = { + def checkFormat(lit: Expression) = { + if (lit.foldable) { + val value = lit.eval() + value == null || Seq("hex", "utf-8", "base64", "base2").contains( + value.asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT)) + } else false + } + + if (format.forall(checkFormat)) { + super.checkInputDataTypes() + } else { + TypeCheckResult.TypeCheckFailure( + s"Unsupported encoding format: $format. 
The format has to be " + + s"a case-insensitive string literal of 'hex', 'utf-8', 'base2', or 'base64'") + } + } + + override protected def withNewChildInternal(newChild: Expression): ToBinary = + copy(child = newChild) +} + /** * Formats the number X to a format like '#,###,###.##', rounded to D decimal places, * and returns the result as a string. If D is 0, the result has no decimal point or diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index 126960e4fdc94..33ba2b73e6b07 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -1,6 +1,6 @@ ## Summary - - Number of queries: 378 + - Number of queries: 379 - Number of expressions that missing example: 12 - Expressions missing examples: bigint,binary,boolean,date,decimal,double,float,int,smallint,string,timestamp,tinyint ## Schema of Built-in Functions @@ -299,6 +299,7 @@ | org.apache.spark.sql.catalyst.expressions.Tan | tan | SELECT tan(0) | struct | | org.apache.spark.sql.catalyst.expressions.Tanh | tanh | SELECT tanh(0) | struct | | org.apache.spark.sql.catalyst.expressions.TimeWindow | window | SELECT a, window.start, window.end, count(*) as cnt FROM VALUES ('A1', '2021-01-01 00:00:00'), ('A1', '2021-01-01 00:04:30'), ('A1', '2021-01-01 00:06:00'), ('A2', '2021-01-01 00:01:00') AS tab(a, b) GROUP by a, window(b, '5 minutes') ORDER BY a, start | struct | +| org.apache.spark.sql.catalyst.expressions.ToBinary | to_binary | SELECT to_binary('abc', 'utf-8') | struct | | org.apache.spark.sql.catalyst.expressions.ToDegrees | degrees | SELECT degrees(3.141592653589793) | struct | | org.apache.spark.sql.catalyst.expressions.ToNumber | to_number | SELECT to_number('454', '999') | struct | | org.apache.spark.sql.catalyst.expressions.ToRadians | radians | SELECT radians(180) | struct | diff --git 
a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 94924a91991b9..9571f3eb6c2bb 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -135,3 +135,19 @@ select to_number('-454', '-000'); select to_number('-454', 'S000'); select to_number('12,454.8-', '00,000.9-'); select to_number('00,454.8-', '00,000.9-'); + +-- to_binary +select to_binary('abc'); +select to_binary('abc', 'utf-8'); +select to_binary('abc', 'base64'); +select to_binary('abc', 'base2'); +select to_binary('abc', 'hex'); +select to_binary('abc', concat('utf', '-8')); +select to_binary('abc', concat('base', '64')); +select to_binary('abc', 'Hex'); +select to_binary('abc', 'UTF-8'); +select to_binary('abc', null); +select to_binary(null, 'utf-8'); +select to_binary(null, null); +select to_binary(null, cast(null as string)); +select to_binary('abc', 'invalidFormat'); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 99927c262c5ac..86c90fc1fe34d 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 102 +-- Number of queries: 116 -- !query @@ -824,3 +824,116 @@ select to_number('00,454.8-', '00,000.9-') struct -- !query output -454.8 + + +-- !query +select to_binary('abc') +-- !query schema +struct +-- !query output +� + + +-- !query +select to_binary('abc', 'utf-8') +-- !query schema +struct +-- !query output +abc + + +-- !query +select to_binary('abc', 'base64') +-- !query schema +struct +-- !query output +i� + + +-- !query +select to_binary('abc', 'base2') +-- !query schema +struct +-- !query 
output +abc + + +-- !query +select to_binary('abc', 'hex') +-- !query schema +struct +-- !query output +� + + +-- !query +select to_binary('abc', concat('utf', '-8')) +-- !query schema +struct +-- !query output +abc + + +-- !query +select to_binary('abc', concat('base', '64')) +-- !query schema +struct +-- !query output +i� + + +-- !query +select to_binary('abc', 'Hex') +-- !query schema +struct +-- !query output +� + + +-- !query +select to_binary('abc', 'UTF-8') +-- !query schema +struct +-- !query output +abc + + +-- !query +select to_binary('abc', null) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_binary(null, 'utf-8') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_binary(null, null) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_binary(null, cast(null as string)) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_binary('abc', 'invalidFormat') +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'to_binary('abc', 'invalidFormat')' due to data type mismatch: Unsupported encoding format: Some(invalidFormat). 
The format has to be a case-insensitive string literal of 'hex', 'utf-8', 'base2', or 'base64'; line 1 pos 7 diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 6baac6148885f..f3852a9527b00 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 102 +-- Number of queries: 116 -- !query @@ -820,3 +820,116 @@ select to_number('00,454.8-', '00,000.9-') struct -- !query output -454.8 + + +-- !query +select to_binary('abc') +-- !query schema +struct +-- !query output +� + + +-- !query +select to_binary('abc', 'utf-8') +-- !query schema +struct +-- !query output +abc + + +-- !query +select to_binary('abc', 'base64') +-- !query schema +struct +-- !query output +i� + + +-- !query +select to_binary('abc', 'base2') +-- !query schema +struct +-- !query output +abc + + +-- !query +select to_binary('abc', 'hex') +-- !query schema +struct +-- !query output +� + + +-- !query +select to_binary('abc', concat('utf', '-8')) +-- !query schema +struct +-- !query output +abc + + +-- !query +select to_binary('abc', concat('base', '64')) +-- !query schema +struct +-- !query output +i� + + +-- !query +select to_binary('abc', 'Hex') +-- !query schema +struct +-- !query output +� + + +-- !query +select to_binary('abc', 'UTF-8') +-- !query schema +struct +-- !query output +abc + + +-- !query +select to_binary('abc', null) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_binary(null, 'utf-8') +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_binary(null, null) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select to_binary(null, cast(null as string)) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select 
to_binary('abc', 'invalidFormat') +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'to_binary('abc', 'invalidFormat')' due to data type mismatch: Unsupported encoding format: Some(invalidFormat). The format has to be a case-insensitive string literal of 'hex', 'utf-8', 'base2', or 'base64'; line 1 pos 7