[SPARK-50031][SQL] Add the TryParseUrl expression #48500
@@ -587,6 +587,7 @@ URL Functions
     :toctree: api/

     parse_url
+    try_parse_url
     url_decode
     url_encode
     try_url_decode
@@ -333,6 +333,20 @@ def test_rand_functions(self):
         rndn2 = df.select("key", F.randn(0)).collect()
         self.assertEqual(sorted(rndn1), sorted(rndn2))

+    def test_try_parse_url(self):
+        df = self.spark.createDataFrame(
+            [("https://spark.apache.org/path?query=1", "QUERY", "query")],
+            ["url", "part", "key"],
+        )
+        actual = df.select(F.try_parse_url(df.url, df.part, df.key)).collect()
+        self.assertEqual(actual, [Row("1")])
+        df = self.spark.createDataFrame(
+            [("inva lid://spark.apache.org/path?query=1", "QUERY", "query")],
+            ["url", "part", "key"],
+        )
+        actual = df.select(F.try_parse_url(df.url, df.part, df.key)).collect()
+        self.assertEqual(actual, [Row(None)])
+
     def test_string_functions(self):
         string_functions = [
             "upper",

Review comment on test_try_parse_url: Could we also add a case here that returns null? This is to make sure Python works as well with invalid URLs.
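For readers skimming the diff, here is a minimal standalone sketch of the behavior the new test exercises. The local session setup is an assumption for illustration and is not part of the change:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()

df = spark.createDataFrame(
    [
        ("https://spark.apache.org/path?query=1", "QUERY", "query"),     # well-formed URL
        ("inva lid://spark.apache.org/path?query=1", "QUERY", "query"),  # space makes the scheme invalid
    ],
    ["url", "part", "key"],
)

# try_parse_url extracts the requested part; the malformed row degrades
# to NULL instead of raising an error.
df.select(F.try_parse_url("url", "part", "key").alias("value")).show()
# +-----+
# |value|
# +-----+
# |    1|
# | NULL|
# +-----+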
@@ -169,6 +169,37 @@ object ParseUrl {
   private val REGEXSUBFIX = "=([^&]*)"
 }

+/**
+ * Extracts a part from a URL
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(url, partToExtract[, key]) - Extracts a part from a URL.",
+  examples = """
+    Examples:
+      > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'HOST');
+       spark.apache.org
+      > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY');
+       query=1
+      > SELECT _FUNC_('inva lid://spark.apache.org/path?query=1', 'QUERY');
+       NULL
+      > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY', 'query');
+       1
+  """,
+  since = "4.0.0",
+  group = "url_funcs")
+case class TryParseUrl(params: Seq[Expression], replacement: Expression)
+  extends RuntimeReplaceable with InheritAnalysisRules {
+  def this(children: Seq[Expression]) = this(children, ParseUrl(children, failOnError = false))
+
+  override def prettyName: String = "try_parse_url"
+
+  override def parameters: Seq[Expression] = params
+
+  override protected def withNewChildInternal(newChild: Expression): Expression = {
+    copy(replacement = newChild)
+  }
+}
+
 /**
  * Extracts a part from a URL
  */

Review comment on the doc comment "Extracts a part from a URL": This comment is useless. Let's remove it.
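A note on the pattern above, for reviewers unfamiliar with it: a RuntimeReplaceable expression never executes itself. During analysis it is swapped for its replacement, here ParseUrl(children, failOnError = false), which is what turns malformed URLs into NULL instead of an error. A minimal PySpark check of the resulting behavior, mirroring the SQL examples in the expression description (session setup assumed for illustration):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()

spark.sql(
    "SELECT try_parse_url('http://spark.apache.org/path?query=1', 'QUERY', 'query')"
).show()
# -> 1

spark.sql(
    "SELECT try_parse_url('inva lid://spark.apache.org/path?query=1', 'QUERY')"
).show()
# -> NULL (plain parse_url would instead fail here when spark.sql.ansi.enabled is true)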
@@ -84,6 +84,50 @@ class UrlFunctionsSuite extends QueryTest with SharedSparkSession {
     }
   }

+  test("url try_parse_url function") {
+
+    def testUrl(url: String, expected: Row): Unit = {
+      checkAnswer(Seq[String]((url)).toDF("url").selectExpr(
+        "try_parse_url(url, 'HOST')", "try_parse_url(url, 'PATH')",
+        "try_parse_url(url, 'QUERY')", "try_parse_url(url, 'REF')",
+        "try_parse_url(url, 'PROTOCOL')", "try_parse_url(url, 'FILE')",
+        "try_parse_url(url, 'AUTHORITY')", "try_parse_url(url, 'USERINFO')",
+        "try_parse_url(url, 'QUERY', 'query')"), expected)
+    }
+
+    testUrl(
+      "http://userinfo@spark.apache.org/path?query=1#Ref",
+      Row("spark.apache.org", "/path", "query=1", "Ref",
+        "http", "/path?query=1", "userinfo@spark.apache.org", "userinfo", "1"))
+
+    testUrl(
+      "https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two",
+      Row("example.com", "/dir%20/pa%20th.HTML", "query=x%20y&q2=2", "Ref%20two",
+        "https", "/dir%20/pa%20th.HTML?query=x%20y&q2=2", "use%20r:pas%20s@example.com",
+        "use%20r:pas%20s", "x%20y"))
+
+    testUrl(
+      "http://user:pass@host",
+      Row("host", "", null, null, "http", "", "user:pass@host", "user:pass", null))
+
+    testUrl(
+      "http://user:pass@host/",
+      Row("host", "/", null, null, "http", "/", "user:pass@host", "user:pass", null))
+
+    testUrl(
+      "http://user:pass@host/?#",
+      Row("host", "/", "", "", "http", "/?", "user:pass@host", "user:pass", null))
+
+    testUrl(
+      "http://user:pass@host/file;param?query;p2",
+      Row("host", "/file;param", "query;p2", null, "http", "/file;param?query;p2",
+        "user:pass@host", "user:pass", null))
+
+    testUrl(
+      "inva lid://user:pass@host/file;param?query;p2",
+      Row(null, null, null, null, null, null, null, null, null))
+  }
+
   test("url encode/decode function") {
     def testUrl(url: String, fn: String, expected: Row): Unit = {
       checkAnswer(Seq[String]((url)).toDF("url")
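One subtlety the suite above pins down: for a URL with no explicit path or query, such as http://user:pass@host, the PATH part comes back as an empty string while QUERY and REF come back as NULL. A quick interactive check (a sketch; session setup assumed):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()

row = spark.sql(
    "SELECT try_parse_url('http://user:pass@host', 'PATH') AS path, "
    "try_parse_url('http://user:pass@host', 'QUERY') AS query"
).first()

# Expected per the suite above: path == '' and query is None.
print(repr(row.path), repr(row.query))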
Review comment: Can we add a unit test in, e.g., test_functions.py? Then the tests will be reused in both Spark Connect and classic.