-
Notifications
You must be signed in to change notification settings - Fork 1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Adds udf regexp_split_to_array (#5501)
* feat: Adds udf regexp_split
- Loading branch information
1 parent
27d8ad5
commit 3766129
Showing
7 changed files
with
442 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
68 changes: 68 additions & 0 deletions
68
ksqldb-engine/src/main/java/io/confluent/ksql/function/udf/string/RegexpSplitToArray.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
/* | ||
* Copyright 2020 Confluent Inc. | ||
* | ||
* Licensed under the Confluent Community License (the "License"); you may not use | ||
* this file except in compliance with the License. You may obtain a copy of the | ||
* License at | ||
* | ||
* http://www.confluent.io/confluent-community-license | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
* WARRANTIES OF ANY KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations under the License. | ||
*/ | ||
|
||
package io.confluent.ksql.function.udf.string; | ||
|
||
import com.google.common.base.Splitter; | ||
import io.confluent.ksql.function.KsqlFunctionException; | ||
import io.confluent.ksql.function.udf.Udf; | ||
import io.confluent.ksql.function.udf.UdfDescription; | ||
import io.confluent.ksql.function.udf.UdfParameter; | ||
import io.confluent.ksql.util.KsqlConstants; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
import java.util.regex.Pattern; | ||
import java.util.regex.PatternSyntaxException; | ||
|
||
@UdfDescription(name = "regexp_split_to_array", | ||
author = KsqlConstants.CONFLUENT_AUTHOR, | ||
description = "Splits a string into an array of substrings based on a regexp. " | ||
+ "If the regexp is found at the beginning of the string, end of the string, or there " | ||
+ "are contiguous matches in the string, then empty strings are added to the array. " | ||
+ "If the regexp is not found, then the original string is returned as the only " | ||
+ "element in the array. If the regexp is empty, then all characters in the string are " | ||
+ "split.") | ||
public class RegexpSplitToArray { | ||
|
||
@Udf(description = "Splits a string into an array of substrings based on a regexp.") | ||
public List<String> regexpSplit( | ||
@UdfParameter( | ||
description = "The string to be split. If NULL, then function returns NULL.") | ||
final String string, | ||
@UdfParameter( | ||
description = "The regular expression to split the string by. " | ||
+ "If NULL, then function returns NULL.") | ||
final String regexp) { | ||
if (string == null || regexp == null) { | ||
return null; | ||
} | ||
|
||
// Use Guava version to be compatible with other splitting functions. | ||
final Pattern p = getPattern(regexp); | ||
if (regexp.isEmpty() || p.matcher("").matches()) { | ||
return Arrays.asList(p.split(string)); | ||
} else { | ||
return Splitter.on(p).splitToList(string); | ||
} | ||
} | ||
|
||
private Pattern getPattern(final String regexp) { | ||
try { | ||
return Pattern.compile(regexp); | ||
} catch (PatternSyntaxException e) { | ||
throw new KsqlFunctionException("Invalid regular expression pattern: " + regexp, e); | ||
} | ||
} | ||
} |
79 changes: 79 additions & 0 deletions
79
...db-engine/src/test/java/io/confluent/ksql/function/udf/string/RegexpSplitToArrayTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
package io.confluent.ksql.function.udf.string; | ||
|
||
import static org.hamcrest.MatcherAssert.assertThat; | ||
import static org.hamcrest.Matchers.contains; | ||
import static org.hamcrest.Matchers.is; | ||
import static org.hamcrest.Matchers.nullValue; | ||
|
||
import io.confluent.ksql.function.KsqlFunctionException; | ||
import org.junit.Test; | ||
|
||
public class RegexpSplitToArrayTest { | ||
|
||
private final RegexpSplitToArray udf = new RegexpSplitToArray(); | ||
|
||
@Test | ||
public void shouldReturnNull() { | ||
assertThat(udf.regexpSplit(null, "a"), is(nullValue())); | ||
assertThat(udf.regexpSplit("string", null), is(nullValue())); | ||
assertThat(udf.regexpSplit(null, null), is(nullValue())); | ||
} | ||
|
||
@Test | ||
public void shouldReturnOriginalStringOnNotFoundRegexp() { | ||
assertThat(udf.regexpSplit("", "z"), contains("")); | ||
assertThat(udf.regexpSplit("x-y", "z"), contains("x-y")); | ||
} | ||
|
||
@Test | ||
public void shouldSplitAllCharactersByGivenAnEmptyRegexp() { | ||
assertThat(udf.regexpSplit("", ""), contains("")); | ||
assertThat(udf.regexpSplit("x-y", ""), contains("x", "-", "y")); | ||
assertThat(udf.regexpSplit("x", ""), contains("x")); | ||
} | ||
|
||
@Test | ||
public void shouldSplitStringByGivenRegexp() { | ||
assertThat(udf.regexpSplit("x-y", "-"), contains("x", "y")); | ||
assertThat(udf.regexpSplit("x-y", "x"), contains("", "-y")); | ||
assertThat(udf.regexpSplit("x-y", "y"), contains("x-", "")); | ||
assertThat(udf.regexpSplit("a-b-c-d", "-"), contains("a", "b", "c", "d")); | ||
|
||
assertThat(udf.regexpSplit("x-y", "."), contains("", "", "", "")); | ||
assertThat(udf.regexpSplit("a-b-c-b-d", ".b."), contains("a", "c", "d")); | ||
assertThat(udf.regexpSplit("a-b-c", "^.."), contains("", "b-c")); | ||
assertThat(udf.regexpSplit("a-b-ccatd-ecatf", "(-|cat)"), | ||
contains("a", "b", "c", "d", "e", "f")); | ||
} | ||
|
||
@Test | ||
public void shouldSplitAndAddEmptySpacesIfRegexpIsFoundAtTheBeginningOrEnd() { | ||
assertThat(udf.regexpSplit("-A", "-"), contains("", "A")); | ||
assertThat(udf.regexpSplit("-A-B", "-"), contains("", "A", "B")); | ||
assertThat(udf.regexpSplit("A-", "-"), contains("A", "")); | ||
assertThat(udf.regexpSplit("A-B-", "-"), contains("A", "B", "")); | ||
assertThat(udf.regexpSplit("-A-B-", "-"), contains("", "A", "B", "")); | ||
|
||
assertThat(udf.regexpSplit("A", "^"), contains("A")); | ||
assertThat(udf.regexpSplit("A", "$"), contains("A")); | ||
assertThat(udf.regexpSplit("AB", "^"), contains("AB")); | ||
assertThat(udf.regexpSplit("AB", "$"), contains("AB")); | ||
} | ||
|
||
@Test | ||
public void shouldSplitAndAddEmptySpacesIfRegexIsFoundInContiguousPositions() { | ||
assertThat(udf.regexpSplit("A--A", "-"), contains("A", "", "A")); | ||
assertThat(udf.regexpSplit("z--A--z", "-"), contains("z", "", "A", "", "z")); | ||
assertThat(udf.regexpSplit("--A--A", "-"), contains("", "", "A", "", "A")); | ||
assertThat(udf.regexpSplit("A--A--", "-"), contains("A", "", "A", "", "")); | ||
|
||
assertThat(udf.regexpSplit("aababa", "ab"), contains("a", "", "a")); | ||
assertThat(udf.regexpSplit("aababa", "(ab)+"), contains("a", "a")); | ||
assertThat(udf.regexpSplit("aabcda", "(ab|cd)"), contains("a", "", "a")); | ||
} | ||
|
||
@Test(expected = KsqlFunctionException.class) | ||
public void shouldThrowOnInvalidPattern() { | ||
udf.regexpSplit("abcd", "(()"); | ||
} | ||
} |
126 changes: 126 additions & 0 deletions
126
...st/resources/historical_plans/split_-_regexp_split_to_array/6.0.0_1591047957798/plan.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
{ | ||
"plan" : [ { | ||
"@type" : "ksqlPlanV1", | ||
"statementText" : "CREATE STREAM TEST (K STRING KEY, INPUT_STRING STRING, PATTERN STRING) WITH (KAFKA_TOPIC='test_topic', VALUE_FORMAT='JSON');", | ||
"ddlCommand" : { | ||
"@type" : "createStreamV1", | ||
"sourceName" : "TEST", | ||
"schema" : "`K` STRING KEY, `INPUT_STRING` STRING, `PATTERN` STRING", | ||
"topicName" : "test_topic", | ||
"formats" : { | ||
"keyFormat" : { | ||
"format" : "KAFKA" | ||
}, | ||
"valueFormat" : { | ||
"format" : "JSON" | ||
} | ||
} | ||
} | ||
}, { | ||
"@type" : "ksqlPlanV1", | ||
"statementText" : "CREATE STREAM OUTPUT AS SELECT\n TEST.K K,\n REGEXP_SPLIT_TO_ARRAY(TEST.INPUT_STRING, TEST.PATTERN) EXTRACTED\nFROM TEST TEST\nEMIT CHANGES", | ||
"ddlCommand" : { | ||
"@type" : "createStreamV1", | ||
"sourceName" : "OUTPUT", | ||
"schema" : "`K` STRING KEY, `EXTRACTED` ARRAY<STRING>", | ||
"topicName" : "OUTPUT", | ||
"formats" : { | ||
"keyFormat" : { | ||
"format" : "KAFKA" | ||
}, | ||
"valueFormat" : { | ||
"format" : "JSON" | ||
} | ||
} | ||
}, | ||
"queryPlan" : { | ||
"sources" : [ "TEST" ], | ||
"sink" : "OUTPUT", | ||
"physicalPlan" : { | ||
"@type" : "streamSinkV1", | ||
"properties" : { | ||
"queryContext" : "OUTPUT" | ||
}, | ||
"source" : { | ||
"@type" : "streamSelectV1", | ||
"properties" : { | ||
"queryContext" : "Project" | ||
}, | ||
"source" : { | ||
"@type" : "streamSourceV1", | ||
"properties" : { | ||
"queryContext" : "KsqlTopic/Source" | ||
}, | ||
"topicName" : "test_topic", | ||
"formats" : { | ||
"keyFormat" : { | ||
"format" : "KAFKA" | ||
}, | ||
"valueFormat" : { | ||
"format" : "JSON" | ||
} | ||
}, | ||
"sourceSchema" : "`K` STRING KEY, `INPUT_STRING` STRING, `PATTERN` STRING" | ||
}, | ||
"keyColumnNames" : [ "K" ], | ||
"selectExpressions" : [ "REGEXP_SPLIT_TO_ARRAY(INPUT_STRING, PATTERN) AS EXTRACTED" ] | ||
}, | ||
"formats" : { | ||
"keyFormat" : { | ||
"format" : "KAFKA" | ||
}, | ||
"valueFormat" : { | ||
"format" : "JSON" | ||
} | ||
}, | ||
"topicName" : "OUTPUT" | ||
}, | ||
"queryId" : "CSAS_OUTPUT_0" | ||
} | ||
} ], | ||
"configs" : { | ||
"ksql.extension.dir" : "ext", | ||
"ksql.streams.cache.max.bytes.buffering" : "0", | ||
"ksql.security.extension.class" : null, | ||
"ksql.transient.prefix" : "transient_", | ||
"ksql.persistence.wrap.single.values" : "true", | ||
"ksql.authorization.cache.expiry.time.secs" : "30", | ||
"ksql.schema.registry.url" : "", | ||
"ksql.streams.default.deserialization.exception.handler" : "io.confluent.ksql.errors.LogMetricAndContinueExceptionHandler", | ||
"ksql.output.topic.name.prefix" : "", | ||
"ksql.streams.auto.offset.reset" : "earliest", | ||
"ksql.query.pull.enable.standby.reads" : "false", | ||
"ksql.connect.url" : "http://localhost:8083", | ||
"ksql.service.id" : "some.ksql.service.id", | ||
"ksql.internal.topic.min.insync.replicas" : "1", | ||
"ksql.streams.shutdown.timeout.ms" : "300000", | ||
"ksql.internal.topic.replicas" : "1", | ||
"ksql.insert.into.values.enabled" : "true", | ||
"ksql.query.pull.max.allowed.offset.lag" : "9223372036854775807", | ||
"ksql.query.pull.max.qps" : "2147483647", | ||
"ksql.streams.default.production.exception.handler" : "io.confluent.ksql.errors.ProductionExceptionHandlerUtil$LogAndFailProductionExceptionHandler", | ||
"ksql.access.validator.enable" : "auto", | ||
"ksql.streams.bootstrap.servers" : "localhost:0", | ||
"ksql.streams.commit.interval.ms" : "2000", | ||
"ksql.metric.reporters" : "", | ||
"ksql.query.pull.metrics.enabled" : "false", | ||
"ksql.streams.auto.commit.interval.ms" : "0", | ||
"ksql.metrics.extension" : null, | ||
"ksql.streams.topology.optimization" : "all", | ||
"ksql.hidden.topics" : "_confluent.*,__confluent.*,_schemas,__consumer_offsets,__transaction_state,connect-configs,connect-offsets,connect-status,connect-statuses", | ||
"ksql.streams.num.stream.threads" : "4", | ||
"ksql.timestamp.throw.on.invalid" : "false", | ||
"ksql.authorization.cache.max.entries" : "10000", | ||
"ksql.metrics.tags.custom" : "", | ||
"ksql.pull.queries.enable" : "true", | ||
"ksql.udfs.enabled" : "true", | ||
"ksql.udf.enable.security.manager" : "true", | ||
"ksql.connect.worker.config" : "", | ||
"ksql.sink.window.change.log.additional.retention" : "1000000", | ||
"ksql.readonly.topics" : "_confluent.*,__confluent.*,_schemas,__consumer_offsets,__transaction_state,connect-configs,connect-offsets,connect-status,connect-statuses", | ||
"ksql.udf.collect.metrics" : "false", | ||
"ksql.persistent.prefix" : "query_", | ||
"ksql.query.persistent.active.limit" : "2147483647", | ||
"ksql.error.classifier.regex" : "" | ||
} | ||
} |
Oops, something went wrong.