Skip to content

Commit

Permalink
Add regexp_like, improve docs and examples for regexp_match` (#9137)
Browse files Browse the repository at this point in the history
* Add regexp_like scalar function

* prettier update.

* Revert changes outside of the regexp_like PR.

* Moved dataframe examples to docs on the actual functions and extracted the test data to a csv file.

* Changed pathing to make ci script happy.

* improved examples with expected output.
  • Loading branch information
Omega359 authored Feb 9, 2024
1 parent 071dc99 commit 30d2be9
Show file tree
Hide file tree
Showing 18 changed files with 1,316 additions and 134 deletions.
7 changes: 4 additions & 3 deletions datafusion-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,21 +52,22 @@ cargo run --example csv_sql
- [`dataframe_output.rs`](examples/dataframe_output.rs): Examples of methods which write data out from a DataFrame
- [`dataframe_in_memory.rs`](examples/dataframe_in_memory.rs): Run a query using a DataFrame against data in memory
- [`deserialize_to_struct.rs`](examples/deserialize_to_struct.rs): Convert query results into rust structs using serde
- [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify and anaylze `Expr`s
- [`expr_api.rs`](examples/expr_api.rs): Create, execute, simplify and analyze `Expr`s
- [`flight_sql_server.rs`](examples/flight/flight_sql_server.rs): Run DataFusion as a standalone process and execute SQL queries from JDBC clients
- [`make_date.rs`](examples/make_date.rs): Examples of using the make_date function
- [`memtable.rs`](examples/memtable.rs): Create an query data in memory using SQL and `RecordBatch`es
- [`parquet_sql.rs`](examples/parquet_sql.rs): Build and run a query plan from a SQL statement against a local Parquet file
- [`parquet_sql_multiple_files.rs`](examples/parquet_sql_multiple_files.rs): Build and run a query plan from a SQL statement against multiple local Parquet files
- [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3
- [`query-http-csv.rs`](examples/query-http-csv.rs): Configure `object_store` and run a query against files vi HTTP
- [`regexp.rs`](examples/regexp.rs): Examples of using regular expression functions
- [`rewrite_expr.rs`](examples/rewrite_expr.rs): Define and invoke a custom Query Optimizer pass
- [`to_timestamp.rs`](examples/to_timestamp.rs): Examples of using to_timestamp functions
- [`simple_udf.rs`](examples/simple_udf.rs): Define and invoke a User Defined Scalar Function (UDF)
- [`advanced_udf.rs`](examples/advanced_udf.rs): Define and invoke a more complicated User Defined Scalar Function (UDF)
- [`simple_udaf.rs`](examples/simple_udaf.rs): Define and invoke a User Defined Aggregate Function (UDAF)
- [`advanced_udaf.rs`](examples/advanced_udaf.rs): Define and invoke a more complicated User Defined Aggregate Function (UDAF)
- [`simple_udfw.rs`](examples/simple_udwf.rs): Define and invoke a User Defined Window Function (UDWF)
- [`make_date.rs`](examples/make_date.rs): Examples of using the make_date function
- [`to_timestamp.rs`](examples/to_timestamp.rs): Examples of using the to_timestamp functions
- [`advanced_udwf.rs`](examples/advanced_udwf.rs): Define and invoke a more complicated User Defined Window Function (UDWF)

## Distributed
Expand Down
345 changes: 345 additions & 0 deletions datafusion-examples/examples/regexp.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,345 @@
// Licensed to the Apache Software Foundation (ASF) under one
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use datafusion::error::Result;
use datafusion::prelude::*;
use datafusion_common::{assert_batches_eq, assert_contains};

/// This example demonstrates how to use the regexp_* functions
///
/// the full list of supported features and
/// syntax can be found at
/// https://docs.rs/regex/latest/regex/#syntax
///
/// Supported flags can be found at
/// https://docs.rs/regex/latest/regex/#grouping-and-flags
#[tokio::main]
async fn main() -> Result<()> {
let ctx = SessionContext::new();
ctx.register_csv(
"examples",
"../../datafusion/physical-expr/tests/data/regex.csv",
CsvReadOptions::new(),
)
.await?;

//
//
//regexp_like examples
//
//
// regexp_like format is (regexp_like(text, regex[, flags])
//

// use sql and regexp_like function to test col 'values', against patterns in col 'patterns' without flags
let result = ctx
.sql("select regexp_like(values, patterns) from examples")
.await?
.collect()
.await?;

assert_batches_eq!(
&[
"+------------------------------------------------+",
"| regexp_like(examples.values,examples.patterns) |",
"+------------------------------------------------+",
"| true |",
"| true |",
"| false |",
"| false |",
"| false |",
"| false |",
"| true |",
"| true |",
"| true |",
"| true |",
"| true |",
"+------------------------------------------------+",
],
&result
);

// use sql and regexp_like function to test col 'values', against patterns in col 'patterns' with flags
let result = ctx
.sql("select regexp_like(values, patterns, flags) from examples")
.await?
.collect()
.await?;

assert_batches_eq!(
&[
"+---------------------------------------------------------------+",
"| regexp_like(examples.values,examples.patterns,examples.flags) |",
"+---------------------------------------------------------------+",
"| true |",
"| true |",
"| true |",
"| false |",
"| false |",
"| false |",
"| true |",
"| true |",
"| true |",
"| true |",
"| true |",
"+---------------------------------------------------------------+",
],
&result
);

// literals work as well
// to match against the entire input use ^ and $ in the regex
let result = ctx
.sql("select regexp_like('John Smith', '^.*Smith$'), regexp_like('Smith Jones', '^Smith.*$')")
.await?
.collect()
.await?;

assert_batches_eq!(
&[
"+---------------------------------------------------+----------------------------------------------------+",
"| regexp_like(Utf8(\"John Smith\"),Utf8(\"^.*Smith$\")) | regexp_like(Utf8(\"Smith Jones\"),Utf8(\"^Smith.*$\")) |",
"+---------------------------------------------------+----------------------------------------------------+",
"| true | true |",
"+---------------------------------------------------+----------------------------------------------------+",
],
&result
);

// look-around and back references are not supported for performance
// reasons.
// Note that an error may not always be returned but the result
// if returned will always be false
let result = ctx
.sql(r"select regexp_like('(?<=[A-Z]\w )Smith', 'John Smith', 'i') as a")
.await?
.collect()
.await;

assert!(result.is_ok());
let result = result.unwrap();
assert_eq!(result.len(), 1);

assert_batches_eq!(
&[
"+-------+",
"| a |",
"+-------+",
"| false |",
"+-------+",
],
&result
);

// invalid flags will result in an error
let result = ctx
.sql(r"select regexp_like('\b4(?!000)\d\d\d\b', 4010, 'g')")
.await?
.collect()
.await;

let expected = "regexp_like() does not support the \"global\" option";
assert_contains!(result.unwrap_err().to_string(), expected);

// there is a size limit on the regex during regex compilation
let result = ctx
.sql("select regexp_like('aaaaa', 'a{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}{5}')")
.await?
.collect()
.await;

let expected = "Regular expression did not compile: CompiledTooBig";
assert_contains!(result.unwrap_err().to_string(), expected);

//
//
//regexp_match examples
//
//
// regexp_match format is (regexp_match(text, regex[, flags])
//

let _ = ctx.table("examples").await?;

// use sql and regexp_match function to test col 'values', against patterns in col 'patterns' without flags
let result = ctx
.sql("select regexp_match(values, patterns) from examples")
.await?
.collect()
.await?;

assert_batches_eq!(
&[
"+-------------------------------------------------+",
"| regexp_match(examples.values,examples.patterns) |",
"+-------------------------------------------------+",
"| [a] |",
"| [A] |",
"| |",
"| |",
"| |",
"| |",
"| [010] |",
"| [Düsseldorf] |",
"| [Москва] |",
"| [Köln] |",
"| [اليوم] |",
"+-------------------------------------------------+",
],
&result
);

// use dataframe and regexp_match function to test col 'values', against patterns in col 'patterns' with flags
let result = ctx
.sql("select regexp_match(values, patterns, flags) from examples")
.await?
.collect()
.await?;

assert_batches_eq!(
&[
"+----------------------------------------------------------------+",
"| regexp_match(examples.values,examples.patterns,examples.flags) |",
"+----------------------------------------------------------------+",
"| [a] |",
"| [A] |",
"| [B] |",
"| |",
"| |",
"| |",
"| [010] |",
"| [Düsseldorf] |",
"| [Москва] |",
"| [Köln] |",
"| [اليوم] |",
"+----------------------------------------------------------------+",
],
&result
);

// literals work as well
// to match against the entire input use ^ and $ in the regex
let result = ctx
.sql("select regexp_match('John Smith', '^.*Smith$'), regexp_match('Smith Jones', '^Smith.*$')")
.await?
.collect()
.await?;

assert_batches_eq!(
&[
"+----------------------------------------------------+-----------------------------------------------------+",
"| regexp_match(Utf8(\"John Smith\"),Utf8(\"^.*Smith$\")) | regexp_match(Utf8(\"Smith Jones\"),Utf8(\"^Smith.*$\")) |",
"+----------------------------------------------------+-----------------------------------------------------+",
"| [John Smith] | [Smith Jones] |",
"+----------------------------------------------------+-----------------------------------------------------+",
],
&result
);

//
//
//regexp_replace examples
//
//
// regexp_replace format is (regexp_replace(text, regex, replace[, flags])
//

// use regexp_replace function against tables
let result = ctx
.sql("SELECT regexp_replace(values, patterns, replacement, concat('g', flags)) FROM examples")
.await?
.collect()
.await?;

assert_batches_eq!(
&[
"+---------------------------------------------------------------------------------------------------------+",
"| regexp_replace(examples.values,examples.patterns,examples.replacement,concat(Utf8(\"g\"),examples.flags)) |",
"+---------------------------------------------------------------------------------------------------------+",
"| bbabbbc |",
"| B |",
"| aec |",
"| AbC |",
"| aBC |",
"| 4000 |",
"| xyz |",
"| München |",
"| Moscow |",
"| Koln |",
"| Today |",
"+---------------------------------------------------------------------------------------------------------+",
],
&result
);

// global flag example
let result = ctx
.sql("SELECT regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', 'g')")
.await?
.collect()
.await?;

assert_batches_eq!(
&[
"+------------------------------------------------------------------------+",
"| regexp_replace(Utf8(\"foobarbaz\"),Utf8(\"b(..)\"),Utf8(\"X\\1Y\"),Utf8(\"g\")) |",
"+------------------------------------------------------------------------+",
"| fooXarYXazY |",
"+------------------------------------------------------------------------+",
],
&result
);

// without global flag
let result = ctx
.sql("SELECT regexp_replace('foobarbaz', 'b(..)', 'X\\1Y')")
.await?
.collect()
.await?;

assert_batches_eq!(
&[
"+--------------------------------------------------------------+",
"| regexp_replace(Utf8(\"foobarbaz\"),Utf8(\"b(..)\"),Utf8(\"X\\1Y\")) |",
"+--------------------------------------------------------------+",
"| fooXarYbaz |",
"+--------------------------------------------------------------+",
],
&result
);

// null regex means null result
let result = ctx
.sql("SELECT regexp_replace('foobarbaz', NULL, 'X\\1Y', 'g')")
.await?
.collect()
.await?;

assert_batches_eq!(
&[
"+---------------------------------------------------------------+",
"| regexp_replace(Utf8(\"foobarbaz\"),NULL,Utf8(\"X\\1Y\"),Utf8(\"g\")) |",
"+---------------------------------------------------------------+",
"| |",
"+---------------------------------------------------------------+",
],
&result
);

Ok(())
}
21 changes: 21 additions & 0 deletions datafusion/core/tests/dataframe/dataframe_functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,27 @@ async fn test_fn_md5() -> Result<()> {
Ok(())
}

#[tokio::test]
#[cfg(feature = "unicode_expressions")]
async fn test_fn_regexp_like() -> Result<()> {
let expr = regexp_like(vec![col("a"), lit("[a-z]")]);

let expected = [
"+-----------------------------------+",
"| regexp_like(test.a,Utf8(\"[a-z]\")) |",
"+-----------------------------------+",
"| true |",
"| true |",
"| true |",
"| true |",
"+-----------------------------------+",
];

assert_fn_batches!(expr, expected);

Ok(())
}

#[tokio::test]
#[cfg(feature = "unicode_expressions")]
async fn test_fn_regexp_match() -> Result<()> {
Expand Down
Loading

0 comments on commit 30d2be9

Please sign in to comment.