[SPARK-41780][SQL] Should throw INVALID_PARAMETER_VALUE.PATTERN when the parameter `regexp` is invalid

### What changes were proposed in this pull request?
In this PR, I propose to throw the error class `INVALID_PARAMETER_VALUE.PATTERN` when the `regexp` parameter of `regexp_replace`, `regexp_extract`, or `rlike` is invalid.

### Why are the changes needed?
A clear error message should improve the user experience of Spark SQL.
The original error message is:
<img width="475" alt="image" src="https://user-images.githubusercontent.com/15246973/210493673-c1de9927-9a18-4f9d-a94c-48735b6c5e5a.png">
Valid: `[a\\d]{0,2}`
Invalid: `[a\\d]{0, 2}` (the space after the comma makes the `{m, n}` quantifier illegal for the JVM regex engine)
![image](https://user-images.githubusercontent.com/15246973/210494925-cb6c8043-de02-4c8e-9b40-225350422340.png)
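
To make the failure mode concrete, here is a minimal, Spark-free sketch of why the second pattern is rejected: with Spark SQL's default string-literal escaping, the doubled backslash in the SQL literal reduces to a single one, so the JVM regex engine sees `[a\d]{0, 2}`, and a space inside the `{m,n}` quantifier is not valid syntax. `Pattern.compile` therefore throws `PatternSyntaxException`, the exception this PR now translates into `INVALID_PARAMETER_VALUE.PATTERN`. The object name below is made up for illustration.

```scala
import java.util.regex.{Pattern, PatternSyntaxException}

// Standalone illustration (no Spark needed); `PatternCheck` is a made-up name.
object PatternCheck {
  def main(args: Array[String]): Unit = {
    Pattern.compile("[a\\d]{0,2}")      // valid: compiles fine
    try {
      Pattern.compile("[a\\d]{0, 2}")   // invalid: space inside the quantifier
    } catch {
      case e: PatternSyntaxException =>
        // Before this PR, an exception like this one leaked straight to the user.
        println(s"rejected by the regex engine: ${e.getDescription}")
    }
  }
}
```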

### Does this PR introduce _any_ user-facing change?
Yes. An invalid `regexp` argument now raises a `SparkRuntimeException` with the `INVALID_PARAMETER_VALUE.PATTERN` error class instead of surfacing a raw `java.util.regex.PatternSyntaxException`.

### How was this patch tested?
Added new unit tests.
Passed GA (GitHub Actions).

Closes #39383 from panbingkun/SPARK-41780.

Authored-by: panbingkun <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
panbingkun authored and MaxGekk committed Jan 9, 2023
1 parent 6b92cda commit 15a0f55
Showing 3 changed files with 76 additions and 11 deletions.
@@ -57,7 +57,12 @@ abstract class StringRegexExpression extends BinaryExpression
     null
   } else {
     // Let it raise exception if couldn't compile the regex string
-    Pattern.compile(escape(str))
+    try {
+      Pattern.compile(escape(str))
+    } catch {
+      case e: PatternSyntaxException =>
+        throw QueryExecutionErrors.invalidPatternError(prettyName, e.getPattern, e)
+    }
   }
 
   protected def pattern(str: String) = if (cache == null) compile(str) else cache
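
For readers following the diff, `QueryExecutionErrors.invalidPatternError` is Spark's existing error-factory helper; its actual implementation is not part of this commit. The sketch below is only an illustration of the shape such a helper takes (the class name and message wording are invented): it wraps the raw `PatternSyntaxException` in a runtime error that carries the `INVALID_PARAMETER_VALUE.PATTERN` error class plus the function name and offending pattern, which are exactly the pieces the new tests assert on.

```scala
import java.util.regex.PatternSyntaxException

// Illustrative sketch only -- NOT Spark's actual QueryExecutionErrors.invalidPatternError.
object PatternErrorsSketch {
  // Carries the error class, the SQL function name, and the bad pattern,
  // keeping the original PatternSyntaxException as the cause.
  final class InvalidPatternException(
      val errorClass: String,
      funcName: String,
      pattern: String,
      cause: PatternSyntaxException)
    extends RuntimeException(
      s"[$errorClass] The value of parameter `regexp` in `$funcName` is invalid: '$pattern'",
      cause)

  def invalidPatternError(
      funcName: String,
      pattern: String,
      cause: PatternSyntaxException): Throwable =
    new InvalidPatternException("INVALID_PARAMETER_VALUE.PATTERN", funcName, pattern, cause)
}
```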
@@ -634,7 +639,12 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expression
     if (!p.equals(lastRegex)) {
       // regex value changed
       lastRegex = p.asInstanceOf[UTF8String].clone()
-      pattern = Pattern.compile(lastRegex.toString)
+      try {
+        pattern = Pattern.compile(lastRegex.toString)
+      } catch {
+        case e: PatternSyntaxException =>
+          throw QueryExecutionErrors.invalidPatternError(prettyName, e.getPattern, e)
+      }
     }
     if (!r.equals(lastReplacementInUTF8)) {
       // replacement string changed
@@ -688,7 +698,11 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expression
       if (!$regexp.equals($termLastRegex)) {
         // regex value changed
         $termLastRegex = $regexp.clone();
-        $termPattern = $classNamePattern.compile($termLastRegex.toString());
+        try {
+          $termPattern = $classNamePattern.compile($termLastRegex.toString());
+        } catch (java.util.regex.PatternSyntaxException e) {
+          throw QueryExecutionErrors.invalidPatternError("$prettyName", e.getPattern(), e);
+        }
       }
       if (!$rep.equals($termLastReplacementInUTF8)) {
         // replacement string changed
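
This third hunk differs from the first two because `doGenCode` does not execute the compile itself: it emits Java source as a Scala interpolated string, so the try/catch has to be written into the generated text rather than around it. A minimal, self-contained sketch of that idea (hypothetical helper names, not Spark's codegen API):

```scala
// Sketch of emitting Java source text from Scala; all names here are hypothetical.
object CodegenSketch {
  def emitPatternCompile(termPattern: String, termLastRegex: String, prettyName: String): String =
    s"""try {
       |  $termPattern = java.util.regex.Pattern.compile($termLastRegex.toString());
       |} catch (java.util.regex.PatternSyntaxException e) {
       |  throw QueryExecutionErrors.invalidPatternError("$prettyName", e.getPattern(), e);
       |}""".stripMargin

  def main(args: Array[String]): Unit =
    // Prints the Java snippet that would be spliced into the generated code.
    println(emitPatternCompile("pattern1", "lastRegex1", "regexp_replace"))
}
```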
@@ -279,14 +279,27 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkLiteralRow("abc" rlike _, "^bc", false)
     checkLiteralRow("abc" rlike _, "^ab", true)
     checkLiteralRow("abc" rlike _, "^bc", false)
 
-    intercept[java.util.regex.PatternSyntaxException] {
-      evaluateWithoutCodegen("abbbbc" rlike "**")
-    }
-    intercept[java.util.regex.PatternSyntaxException] {
-      val regex = $"a".string.at(0)
-      evaluateWithoutCodegen("abbbbc" rlike regex, create_row("**"))
-    }
+    checkError(
+      exception = intercept[SparkRuntimeException] {
+        evaluateWithoutCodegen("abbbbc" rlike "**")
+      },
+      errorClass = "INVALID_PARAMETER_VALUE.PATTERN",
+      parameters = Map(
+        "parameter" -> toSQLId("regexp"),
+        "functionName" -> toSQLId("rlike"),
+        "value" -> "'**'")
+    )
+    checkError(
+      exception = intercept[SparkRuntimeException] {
+        val regex = $"a".string.at(0)
+        evaluateWithoutCodegen("abbbbc" rlike regex, create_row("**"))
+      },
+      errorClass = "INVALID_PARAMETER_VALUE.PATTERN",
+      parameters = Map(
+        "parameter" -> toSQLId("regexp"),
+        "functionName" -> toSQLId("rlike"),
+        "value" -> "'**'")
+    )
   }
 
   test("RegexReplace") {
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql
 
+import org.apache.spark.SparkRuntimeException
 import org.apache.spark.sql.catalyst.expressions.Cast._
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
@@ -663,4 +664,41 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
         start = 7,
         stop = 47))
   }
+
+  test("SPARK-41780: INVALID_PARAMETER_VALUE.PATTERN - " +
+    "invalid parameters `regexp` in regexp_replace & regexp_extract") {
+    checkError(
+      exception = intercept[SparkRuntimeException] {
+        sql("select regexp_replace('', '[a\\\\d]{0, 2}', 'x')").collect()
+      },
+      errorClass = "INVALID_PARAMETER_VALUE.PATTERN",
+      parameters = Map(
+        "parameter" -> toSQLId("regexp"),
+        "functionName" -> toSQLId("regexp_replace"),
+        "value" -> "'[a\\\\d]{0, 2}'"
+      )
+    )
+    checkError(
+      exception = intercept[SparkRuntimeException] {
+        sql("select regexp_extract('', '[a\\\\d]{0, 2}', 1)").collect
+      },
+      errorClass = "INVALID_PARAMETER_VALUE.PATTERN",
+      parameters = Map(
+        "parameter" -> toSQLId("regexp"),
+        "functionName" -> toSQLId("regexp_extract"),
+        "value" -> "'[a\\\\d]{0, 2}'"
+      )
+    )
+    checkError(
+      exception = intercept[SparkRuntimeException] {
+        sql("select rlike('', '[a\\\\d]{0, 2}')").collect()
+      },
+      errorClass = "INVALID_PARAMETER_VALUE.PATTERN",
+      parameters = Map(
+        "parameter" -> toSQLId("regexp"),
+        "functionName" -> toSQLId("rlike"),
+        "value" -> "'[a\\\\d]{0, 2}'"
+      )
+    )
+  }
 }
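
End to end, the user-facing effect can be observed from any Spark build that includes this patch. The snippet below is a hedged sketch, assuming a live `SparkSession` named `spark`; it relies on `getErrorClass` from Spark's `SparkThrowable` interface, which `SparkRuntimeException` implements.

```scala
import org.apache.spark.SparkRuntimeException

// With this patch, the invalid quantifier surfaces as an error class
// instead of a raw java.util.regex.PatternSyntaxException.
try {
  spark.sql("select rlike('', '[a\\\\d]{0, 2}')").collect()
} catch {
  case e: SparkRuntimeException =>
    println(e.getErrorClass)  // expected: INVALID_PARAMETER_VALUE.PATTERN
}
```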
