Skip to content

Commit

Permalink
[SPARK-38225][SQL] Adjust input format of function to_binary
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
Adjust input `format` of function `to_binary`:
- fail gracefully when the `format` parameter is not a string
- remove the debatable `base2` format support

### Why are the changes needed?
Currently, the function `to_binary` doesn't handle a non-string `format` parameter properly.
For example, `spark.sql("select to_binary('abc', 1)")` raises a casting error rather than hinting that the encoding format is unsupported.

In addition, the `base2` format is debatable, as discussed in #35415 (comment). For now, we exclude it, following what Snowflake's [to_binary](https://docs.snowflake.com/en/sql-reference/functions/to_binary.html) does.

### Does this PR introduce _any_ user-facing change?
Yes.

- Better error messages for a non-string `format` parameter. For example:

From:
```
scala> spark.sql("select to_binary('abc', 1)")
org.apache.spark.sql.AnalysisException: class java.lang.Integer cannot be cast to class org.apache.spark.unsafe.types.UTF8String (java.lang.Integer is in module java.base of loader 'bootstrap'; org.apache.spark.unsafe.types.UTF8String is in unnamed module of loader 'app'); line 1 pos 7
```

To:
```
scala> spark.sql("select to_binary('abc', 1)")
org.apache.spark.sql.AnalysisException: cannot resolve 'to_binary('abc', 1)' due to data type mismatch: Unsupported encoding format: Some(1). The format has to be a case-insensitive string literal of 'hex', 'utf-8', or 'base64'; line 1 pos 7;
```

- Removed `base2` format support
```
scala> spark.sql("select to_binary('abc', 'base2')").show()
org.apache.spark.sql.AnalysisException: cannot resolve 'to_binary('abc', 'base2')' due to data type mismatch: Unsupported encoding format: Some(base2). The format has to be a case-insensitive string literal of 'hex', 'utf-8', or 'base64'; line 1 pos 7;
```

### How was this patch tested?
Unit test.

Closes #35533 from xinrong-databricks/to_binary_followup.

Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
  • Loading branch information
xinrong-meng authored and cloud-fan committed Feb 18, 2022
1 parent 3a7eafd commit 9fd9830
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2545,7 +2545,7 @@ case class Encode(value: Expression, charset: Expression)
@ExpressionDescription(
usage = """
_FUNC_(str[, fmt]) - Converts the input `str` to a binary value based on the supplied `fmt`.
`fmt` can be a case-insensitive string literal of "hex", "utf-8", "base2", or "base64".
`fmt` can be a case-insensitive string literal of "hex", "utf-8", or "base64".
By default, the binary format for conversion is "hex" if `fmt` is omitted.
The function returns NULL if at least one of the input parameters is NULL.
""",
Expand All @@ -2562,15 +2562,14 @@ case class ToBinary(expr: Expression, format: Option[Expression], child: Express

def this(expr: Expression, format: Expression) = this(expr, Option(format),
format match {
case lit if lit.foldable =>
case lit if (lit.foldable && Seq(StringType, NullType).contains(lit.dataType)) =>
val value = lit.eval()
if (value == null) Literal(null, BinaryType)
else {
value.asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT) match {
case "hex" => Unhex(expr)
case "utf-8" => Encode(expr, Literal("UTF-8"))
case "base64" => UnBase64(expr)
case "base2" => Cast(expr, BinaryType)
case _ => lit
}
}
Expand All @@ -2589,10 +2588,11 @@ case class ToBinary(expr: Expression, format: Option[Expression], child: Express

override def checkInputDataTypes(): TypeCheckResult = {
def checkFormat(lit: Expression) = {
if (lit.foldable) {
if (lit.foldable && Seq(StringType, NullType).contains(lit.dataType)) {
val value = lit.eval()
value == null || Seq("hex", "utf-8", "base64", "base2").contains(
value.asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT))
value == null ||
Seq("hex", "utf-8", "base64").contains(
value.asInstanceOf[UTF8String].toString.toLowerCase(Locale.ROOT))
} else false
}

Expand All @@ -2601,7 +2601,7 @@ case class ToBinary(expr: Expression, format: Option[Expression], child: Express
} else {
TypeCheckResult.TypeCheckFailure(
s"Unsupported encoding format: $format. The format has to be " +
s"a case-insensitive string literal of 'hex', 'utf-8', 'base2', or 'base64'")
s"a case-insensitive string literal of 'hex', 'utf-8', or 'base64'")
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,6 @@ select to_number('00,454.8-', '00,000.9-');
select to_binary('abc');
select to_binary('abc', 'utf-8');
select to_binary('abc', 'base64');
select to_binary('abc', 'base2');
select to_binary('abc', 'hex');
select to_binary('abc', concat('utf', '-8'));
select to_binary('abc', concat('base', '64'));
Expand All @@ -150,4 +149,6 @@ select to_binary('abc', null);
select to_binary(null, 'utf-8');
select to_binary(null, null);
select to_binary(null, cast(null as string));
select to_binary(null, cast(null as int));
select to_binary('abc', 'invalidFormat');
select to_binary('abc', 1);
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 116
-- Number of queries: 117


-- !query
Expand Down Expand Up @@ -850,14 +850,6 @@ struct<to_binary(abc, base64):binary>
i�


-- !query
select to_binary('abc', 'base2')
-- !query schema
struct<to_binary(abc, base2):binary>
-- !query output
abc


-- !query
select to_binary('abc', 'hex')
-- !query schema
Expand Down Expand Up @@ -930,10 +922,28 @@ struct<to_binary(NULL, CAST(NULL AS STRING)):binary>
NULL


-- !query
select to_binary(null, cast(null as int))
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
cannot resolve 'to_binary(NULL, CAST(NULL AS INT))' due to data type mismatch: Unsupported encoding format: Some(ansi_cast(null as int)). The format has to be a case-insensitive string literal of 'hex', 'utf-8', or 'base64'; line 1 pos 7


-- !query
select to_binary('abc', 'invalidFormat')
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
cannot resolve 'to_binary('abc', 'invalidFormat')' due to data type mismatch: Unsupported encoding format: Some(invalidFormat). The format has to be a case-insensitive string literal of 'hex', 'utf-8', 'base2', or 'base64'; line 1 pos 7
cannot resolve 'to_binary('abc', 'invalidFormat')' due to data type mismatch: Unsupported encoding format: Some(invalidFormat). The format has to be a case-insensitive string literal of 'hex', 'utf-8', or 'base64'; line 1 pos 7


-- !query
select to_binary('abc', 1)
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
cannot resolve 'to_binary('abc', 1)' due to data type mismatch: Unsupported encoding format: Some(1). The format has to be a case-insensitive string literal of 'hex', 'utf-8', or 'base64'; line 1 pos 7
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 116
-- Number of queries: 117


-- !query
Expand Down Expand Up @@ -846,14 +846,6 @@ struct<to_binary(abc, base64):binary>
i�


-- !query
select to_binary('abc', 'base2')
-- !query schema
struct<to_binary(abc, base2):binary>
-- !query output
abc


-- !query
select to_binary('abc', 'hex')
-- !query schema
Expand Down Expand Up @@ -926,10 +918,28 @@ struct<to_binary(NULL, CAST(NULL AS STRING)):binary>
NULL


-- !query
select to_binary(null, cast(null as int))
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
cannot resolve 'to_binary(NULL, CAST(NULL AS INT))' due to data type mismatch: Unsupported encoding format: Some(cast(null as int)). The format has to be a case-insensitive string literal of 'hex', 'utf-8', or 'base64'; line 1 pos 7


-- !query
select to_binary('abc', 'invalidFormat')
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
cannot resolve 'to_binary('abc', 'invalidFormat')' due to data type mismatch: Unsupported encoding format: Some(invalidFormat). The format has to be a case-insensitive string literal of 'hex', 'utf-8', 'base2', or 'base64'; line 1 pos 7
cannot resolve 'to_binary('abc', 'invalidFormat')' due to data type mismatch: Unsupported encoding format: Some(invalidFormat). The format has to be a case-insensitive string literal of 'hex', 'utf-8', or 'base64'; line 1 pos 7


-- !query
select to_binary('abc', 1)
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
cannot resolve 'to_binary('abc', 1)' due to data type mismatch: Unsupported encoding format: Some(1). The format has to be a case-insensitive string literal of 'hex', 'utf-8', or 'base64'; line 1 pos 7

0 comments on commit 9fd9830

Please sign in to comment.