-
Notifications
You must be signed in to change notification settings - Fork 28.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-37507][SQL] Add a new SQL function to_binary #35415
Changes from all commits
6c47eb0
e4e86c0
6cf7ad1
dbd834a
71b6bf1
7e94782
ff5120e
7887b43
66f7d49
d6d686c
e7a5afc
352ee8f
561fde8
8db575b
6036dab
40d1d01
e3a616a
c22ac3c
f9e92eb
2d4200c
3568305
9180212
300f1e8
3fd1b77
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2538,6 +2538,77 @@ case class Encode(value: Expression, charset: Expression) | |
newLeft: Expression, newRight: Expression): Encode = copy(value = newLeft, charset = newRight) | ||
} | ||
|
||
/** | ||
* Converts the input expression to a binary value based on the supplied format. | ||
*
* `expr` is the value to convert and `format` is the optional format expression supplied by the
* caller. `child` is the replacement expression chosen by the auxiliary constructors below:
* `Unhex`, `Encode` with "UTF-8", `UnBase64`, or a `Cast` to `BinaryType`, depending on the
* format name. With no format, the replacement is `Unhex(expr)`.
*
* NOTE(review): presumably `child` is what gets evaluated in place of this node, via
* `RuntimeReplaceable` - confirm against that trait.
*/ | ||
// scalastyle:off line.size.limit | ||
@ExpressionDescription( | ||
usage = """ | ||
_FUNC_(str[, fmt]) - Converts the input `str` to a binary value based on the supplied `fmt`. | ||
`fmt` can be a case-insensitive string literal of "hex", "utf-8", "base2", or "base64". | ||
By default, the binary format for conversion is "hex" if `fmt` is omitted. | ||
The function returns NULL if at least one of the input parameters is NULL. | ||
""", | ||
examples = """ | ||
Examples: | ||
> SELECT _FUNC_('abc', 'utf-8'); | ||
abc | ||
""", | ||
since = "3.3.0", | ||
group = "string_funcs") | ||
// scalastyle:on line.size.limit | ||
case class ToBinary(expr: Expression, format: Option[Expression], child: Expression) | ||
extends RuntimeReplaceable { | ||
|
||
// Two-argument form: derives `child` from the format expression at construction time.
//  - foldable format that evaluates to null -> a null literal of BinaryType
//  - "hex" / "utf-8" / "base64" / "base2" (compared after lower-casing with Locale.ROOT)
//    -> Unhex / Encode with "UTF-8" / UnBase64 / Cast to BinaryType, each applied to `expr`
//  - any other foldable value, or a non-foldable format -> the format expression itself is
//    used as `child`; checkInputDataTypes below reports a failure for both of these cases.
// NOTE(review): the asInstanceOf[UTF8String] cast assumes the folded format is a string; a
// non-string foldable format would presumably fail here with a ClassCastException - confirm.
// NOTE(review): "base2" is a plain Cast of `expr` to BinaryType; no parsing of 0/1 digits is
// visible in this block - confirm that this is the intended semantics.
def this(expr: Expression, format: Expression) = this(expr, Option(format), | ||
format match { | ||
case lit if lit.foldable => | ||
val value = lit.eval() | ||
if (value == null) Literal(null, BinaryType) | ||
else { | ||
value.asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT) match { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm, shall we check the type of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added the input check in #35533. |
||
case "hex" => Unhex(expr) | ||
case "utf-8" => Encode(expr, Literal("UTF-8")) | ||
case "base64" => UnBase64(expr) | ||
case "base2" => Cast(expr, BinaryType) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is the same as |
||
case _ => lit | ||
} | ||
} | ||
|
||
case other => other | ||
} | ||
) | ||
HyukjinKwon marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
// One-argument form: no format expression; the replacement is Unhex(expr), i.e. the "hex"
// default described in the usage text above.
def this(expr: Expression) = this(expr, None, Unhex(expr)) | ||
|
||
// Both members expose only the caller-supplied arguments (`expr` and, when present, `format`);
// the derived `child` is left out.
override def flatArguments: Iterator[Any] = Iterator(expr, format) | ||
override def exprsReplaced: Seq[Expression] = expr +: format.toSeq | ||
|
||
override def prettyName: String = "to_binary" | ||
override def dataType: DataType = BinaryType | ||
|
||
// Passes the format check when `format` is absent, or when it is foldable and evaluates either
// to null or to one of the four supported names (compared after lower-casing with
// Locale.ROOT). On success the remaining checks are delegated to super.checkInputDataTypes();
// otherwise a TypeCheckFailure describing the accepted formats is returned.
// NOTE(review): same unchecked asInstanceOf[UTF8String] assumption as in the constructor.
override def checkInputDataTypes(): TypeCheckResult = { | ||
def checkFormat(lit: Expression) = { | ||
if (lit.foldable) { | ||
val value = lit.eval() | ||
value == null || Seq("hex", "utf-8", "base64", "base2").contains( | ||
xinrong-meng marked this conversation as resolved.
Show resolved
Hide resolved
|
||
value.asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT)) | ||
} else false | ||
} | ||
|
||
// Option.forall is true for None, so the one-argument form always takes the first branch.
if (format.forall(checkFormat)) { | ||
super.checkInputDataTypes() | ||
} else { | ||
// `format` is an Option[Expression], so the interpolation below renders it as Some(...).
TypeCheckResult.TypeCheckFailure( | ||
s"Unsupported encoding format: $format. The format has to be " + | ||
s"a case-insensitive string literal of 'hex', 'utf-8', 'base2', or 'base64'") | ||
} | ||
} | ||
|
||
// Rebuilds the node with a new replacement child; `expr` and `format` are carried over as-is.
override protected def withNewChildInternal(newChild: Expression): ToBinary = | ||
copy(child = newChild) | ||
} | ||
|
||
/** | ||
* Formats the number X to a format like '#,###,###.##', rounded to D decimal places, | ||
* and returns the result as a string. If D is 0, the result has no decimal point or | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
-- Automatically generated by SQLQueryTestSuite | ||
-- Number of queries: 102 | ||
-- Number of queries: 116 | ||
|
||
|
||
-- !query | ||
|
@@ -824,3 +824,116 @@ select to_number('00,454.8-', '00,000.9-') | |
struct<to_number(00,454.8-, 00,000.9-):decimal(6,1)> | ||
-- !query output | ||
-454.8 | ||
|
||
|
||
-- !query | ||
select to_binary('abc') | ||
-- !query schema | ||
struct<to_binary(abc):binary> | ||
-- !query output | ||
� | ||
|
||
|
||
-- !query | ||
select to_binary('abc', 'utf-8') | ||
-- !query schema | ||
struct<to_binary(abc, utf-8):binary> | ||
-- !query output | ||
abc | ||
|
||
|
||
-- !query | ||
select to_binary('abc', 'base64') | ||
-- !query schema | ||
struct<to_binary(abc, base64):binary> | ||
-- !query output | ||
i� | ||
|
||
|
||
-- !query | ||
select to_binary('abc', 'base2') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should this be an error instead? For base2 it is expecting a string of 0s and 1s. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please correct me if I'm wrong: My understanding is [inline text lost in extraction]. We may also exclude [inline code lost in extraction]. Also CC @cloud-fan @HyukjinKwon @gengliangwang. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's exclude [inline code lost in extraction]. |
||
-- !query schema | ||
struct<to_binary(abc, base2):binary> | ||
-- !query output | ||
abc | ||
|
||
|
||
-- !query | ||
select to_binary('abc', 'hex') | ||
-- !query schema | ||
struct<to_binary(abc, hex):binary> | ||
-- !query output | ||
� | ||
|
||
|
||
-- !query | ||
select to_binary('abc', concat('utf', '-8')) | ||
-- !query schema | ||
struct<to_binary(abc, concat(utf, -8)):binary> | ||
-- !query output | ||
abc | ||
|
||
|
||
-- !query | ||
select to_binary('abc', concat('base', '64')) | ||
-- !query schema | ||
struct<to_binary(abc, concat(base, 64)):binary> | ||
-- !query output | ||
i� | ||
|
||
|
||
-- !query | ||
select to_binary('abc', 'Hex') | ||
-- !query schema | ||
struct<to_binary(abc, Hex):binary> | ||
-- !query output | ||
� | ||
|
||
|
||
-- !query | ||
select to_binary('abc', 'UTF-8') | ||
-- !query schema | ||
struct<to_binary(abc, UTF-8):binary> | ||
-- !query output | ||
abc | ||
|
||
|
||
-- !query | ||
select to_binary('abc', null) | ||
-- !query schema | ||
struct<to_binary(abc, NULL):binary> | ||
-- !query output | ||
NULL | ||
|
||
|
||
-- !query | ||
select to_binary(null, 'utf-8') | ||
-- !query schema | ||
struct<to_binary(NULL, utf-8):binary> | ||
-- !query output | ||
NULL | ||
|
||
|
||
-- !query | ||
select to_binary(null, null) | ||
-- !query schema | ||
struct<to_binary(NULL, NULL):binary> | ||
-- !query output | ||
NULL | ||
|
||
|
||
-- !query | ||
select to_binary(null, cast(null as string)) | ||
-- !query schema | ||
struct<to_binary(NULL, CAST(NULL AS STRING)):binary> | ||
-- !query output | ||
NULL | ||
|
||
|
||
-- !query | ||
select to_binary('abc', 'invalidFormat') | ||
-- !query schema | ||
struct<> | ||
-- !query output | ||
org.apache.spark.sql.AnalysisException | ||
cannot resolve 'to_binary('abc', 'invalidFormat')' due to data type mismatch: Unsupported encoding format: Some(invalidFormat). The format has to be a case-insensitive string literal of 'hex', 'utf-8', 'base2', or 'base64'; line 1 pos 7 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
-- Automatically generated by SQLQueryTestSuite | ||
-- Number of queries: 102 | ||
-- Number of queries: 116 | ||
|
||
|
||
-- !query | ||
|
@@ -820,3 +820,116 @@ select to_number('00,454.8-', '00,000.9-') | |
struct<to_number(00,454.8-, 00,000.9-):decimal(6,1)> | ||
-- !query output | ||
-454.8 | ||
|
||
|
||
-- !query | ||
select to_binary('abc') | ||
-- !query schema | ||
struct<to_binary(abc):binary> | ||
-- !query output | ||
� | ||
|
||
|
||
-- !query | ||
select to_binary('abc', 'utf-8') | ||
-- !query schema | ||
struct<to_binary(abc, utf-8):binary> | ||
-- !query output | ||
abc | ||
|
||
|
||
-- !query | ||
select to_binary('abc', 'base64') | ||
-- !query schema | ||
struct<to_binary(abc, base64):binary> | ||
-- !query output | ||
i� | ||
|
||
|
||
-- !query | ||
select to_binary('abc', 'base2') | ||
-- !query schema | ||
struct<to_binary(abc, base2):binary> | ||
-- !query output | ||
abc | ||
|
||
|
||
-- !query | ||
select to_binary('abc', 'hex') | ||
-- !query schema | ||
struct<to_binary(abc, hex):binary> | ||
-- !query output | ||
� | ||
|
||
|
||
-- !query | ||
select to_binary('abc', concat('utf', '-8')) | ||
-- !query schema | ||
struct<to_binary(abc, concat(utf, -8)):binary> | ||
-- !query output | ||
abc | ||
|
||
|
||
-- !query | ||
select to_binary('abc', concat('base', '64')) | ||
-- !query schema | ||
struct<to_binary(abc, concat(base, 64)):binary> | ||
-- !query output | ||
i� | ||
|
||
|
||
-- !query | ||
select to_binary('abc', 'Hex') | ||
-- !query schema | ||
struct<to_binary(abc, Hex):binary> | ||
-- !query output | ||
� | ||
|
||
|
||
-- !query | ||
select to_binary('abc', 'UTF-8') | ||
-- !query schema | ||
struct<to_binary(abc, UTF-8):binary> | ||
-- !query output | ||
abc | ||
|
||
|
||
-- !query | ||
select to_binary('abc', null) | ||
-- !query schema | ||
struct<to_binary(abc, NULL):binary> | ||
-- !query output | ||
NULL | ||
|
||
|
||
-- !query | ||
select to_binary(null, 'utf-8') | ||
-- !query schema | ||
struct<to_binary(NULL, utf-8):binary> | ||
-- !query output | ||
NULL | ||
|
||
|
||
-- !query | ||
select to_binary(null, null) | ||
-- !query schema | ||
struct<to_binary(NULL, NULL):binary> | ||
-- !query output | ||
NULL | ||
|
||
|
||
-- !query | ||
select to_binary(null, cast(null as string)) | ||
-- !query schema | ||
struct<to_binary(NULL, CAST(NULL AS STRING)):binary> | ||
-- !query output | ||
NULL | ||
|
||
|
||
-- !query | ||
select to_binary('abc', 'invalidFormat') | ||
-- !query schema | ||
struct<> | ||
-- !query output | ||
org.apache.spark.sql.AnalysisException | ||
cannot resolve 'to_binary('abc', 'invalidFormat')' due to data type mismatch: Unsupported encoding format: Some(invalidFormat). The format has to be a case-insensitive string literal of 'hex', 'utf-8', 'base2', or 'base64'; line 1 pos 7 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Intentionally failed query for testing. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Setting `hex` as the default format, following the linked reference (link target lost in extraction).