From 6f1a102289fcf2e71faa4b7ddf8b4d29fe821693 Mon Sep 17 00:00:00 2001 From: psvri Date: Fri, 25 Nov 2022 18:34:44 +0000 Subject: [PATCH] Adding scalar nlike_dyn, ilike_dyn, nilike_dyn kernels --- arrow/src/compute/kernels/comparison.rs | 344 ++++++++++++++++++------ 1 file changed, 269 insertions(+), 75 deletions(-) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 7423b13bc07c..bca74bee0a1d 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -410,6 +410,39 @@ fn nlike_scalar<'a, L: ArrayAccessor>( like_scalar_op(left, right, |x| !x) } +/// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`], or [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn nlike_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result { + match left.data_type() { + DataType::Utf8 => { + let left = as_string_array(left); + nlike_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = as_largestring_array(left); + nlike_scalar(left, right) + } + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + left => { + nlike_dict_scalar(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "nlike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + /// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / /// [`LargeStringArray`] and a scalar. /// @@ -556,6 +589,39 @@ fn ilike_scalar<'a, L: ArrayAccessor>( Ok(BooleanArray::from(data)) } +/// Perform SQL `left ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`], or [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn ilike_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result { + match left.data_type() { + DataType::Utf8 => { + let left = as_string_array(left); + ilike_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = as_largestring_array(left); + ilike_scalar(left, right) + } + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + left => { + ilike_dict_scalar(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "ilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + /// Perform SQL `left ILIKE right` operation on [`StringArray`] / /// [`LargeStringArray`] and a scalar. /// @@ -702,6 +768,39 @@ fn nilike_scalar<'a, L: ArrayAccessor>( Ok(BooleanArray::from(data)) } +/// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / +/// [`LargeStringArray`], or [`DictionaryArray`] with values +/// [`StringArray`]/[`LargeStringArray`] and a scalar. +/// +/// See the documentation on [`like_utf8`] for more details. +pub fn nilike_utf8_scalar_dyn(left: &dyn Array, right: &str) -> Result { + match left.data_type() { + DataType::Utf8 => { + let left = as_string_array(left); + nilike_scalar(left, right) + } + DataType::LargeUtf8 => { + let left = as_largestring_array(left); + nilike_scalar(left, right) + } + DataType::Dictionary(_, _) => { + downcast_dictionary_array!( + left => { + nilike_dict_scalar(left, right) + } + t => Err(ArrowError::ComputeError(format!( + "Should be DictionaryArray but got: {}", t + ))) + ) + } + _ => { + Err(ArrowError::ComputeError( + "nilike_utf8_scalar_dyn only supports Utf8, LargeUtf8 or DictionaryArray with Utf8 or LargeUtf8 values".to_string(), + )) + } + } +} + /// Perform SQL `left NOT ILIKE right` operation on [`StringArray`] / /// [`LargeStringArray`] and a scalar. /// @@ -4410,6 +4509,10 @@ mod tests { } } }; + ($test_name:ident, $test_name_dyn:ident, $left:expr, $right:expr, $op:expr, $op_dyn:expr, $expected:expr) => { + test_utf8_scalar!($test_name, $left, $right, $op, $expected); + test_utf8_scalar!($test_name_dyn, $left, $right, $op_dyn, $expected); + }; } macro_rules! test_flag_utf8 { @@ -4498,160 +4601,100 @@ mod tests { test_utf8_scalar!( test_utf8_array_like_scalar_escape_testing, - vec!["varchar(255)", "int(255)", "varchar", "int"], - "%(%)%", - like_utf8_scalar, - vec![true, true, false, false] - ); - - test_utf8_scalar!( test_utf8_array_like_scalar_dyn_escape_testing, vec!["varchar(255)", "int(255)", "varchar", "int"], "%(%)%", + like_utf8_scalar, like_utf8_scalar_dyn, vec![true, true, false, false] ); test_utf8_scalar!( test_utf8_array_like_scalar_escape_regex, - vec![".*", "a", "*"], - ".*", - like_utf8_scalar, - vec![true, false, false] - ); - - test_utf8_scalar!( test_utf8_array_like_scalar_dyn_escape_regex, vec![".*", "a", "*"], ".*", + like_utf8_scalar, like_utf8_scalar_dyn, vec![true, false, false] ); test_utf8_scalar!( test_utf8_array_like_scalar_escape_regex_dot, - vec![".", "a", "*"], - ".", - like_utf8_scalar, - vec![true, false, false] - ); - - test_utf8_scalar!( test_utf8_array_like_scalar_dyn_escape_regex_dot, vec![".", "a", "*"], ".", + like_utf8_scalar, like_utf8_scalar_dyn, vec![true, false, false] ); test_utf8_scalar!( test_utf8_array_like_scalar, - vec!["arrow", "parquet", "datafusion", "flight"], - "%ar%", - like_utf8_scalar, - vec![true, true, false, false] - ); - - test_utf8_scalar!( test_utf8_array_like_scalar_dyn, vec!["arrow", "parquet", "datafusion", "flight"], "%ar%", + like_utf8_scalar, like_utf8_scalar_dyn, vec![true, true, false, false] ); test_utf8_scalar!( test_utf8_array_like_scalar_start, - vec!["arrow", "parrow", "arrows", "arr"], - "arrow%", - like_utf8_scalar, - vec![true, false, true, false] - ); - - test_utf8_scalar!( test_utf8_array_like_scalar_dyn_start, vec!["arrow", "parrow", "arrows", "arr"], "arrow%", + like_utf8_scalar, like_utf8_scalar_dyn, vec![true, false, true, false] ); test_utf8_scalar!( test_utf8_array_like_scalar_end, - vec!["arrow", "parrow", "arrows", "arr"], - "%arrow", - like_utf8_scalar, - vec![true, true, false, false] - ); - - test_utf8_scalar!( test_utf8_array_like_scalar_dyn_end, vec!["arrow", "parrow", "arrows", "arr"], "%arrow", + like_utf8_scalar, like_utf8_scalar_dyn, vec![true, true, false, false] ); test_utf8_scalar!( test_utf8_array_like_scalar_equals, - vec!["arrow", "parrow", "arrows", "arr"], - "arrow", - like_utf8_scalar, - vec![true, false, false, false] - ); - - test_utf8_scalar!( test_utf8_array_like_scalar_dyn_equals, vec!["arrow", "parrow", "arrows", "arr"], "arrow", + like_utf8_scalar, like_utf8_scalar_dyn, vec![true, false, false, false] ); test_utf8_scalar!( test_utf8_array_like_scalar_one, - vec!["arrow", "arrows", "parrow", "arr"], - "arrow_", - like_utf8_scalar, - vec![false, true, false, false] - ); - - test_utf8_scalar!( test_utf8_array_like_scalar_dyn_one, vec!["arrow", "arrows", "parrow", "arr"], "arrow_", + like_utf8_scalar, like_utf8_scalar_dyn, vec![false, true, false, false] ); test_utf8_scalar!( test_utf8_scalar_like_escape, - vec!["a%", "a\\x"], - "a\\%", - like_utf8_scalar, - vec![true, false] - ); - - test_utf8_scalar!( test_utf8_scalar_like_dyn_escape, vec!["a%", "a\\x"], "a\\%", + like_utf8_scalar, like_utf8_scalar_dyn, vec![true, false] ); test_utf8_scalar!( test_utf8_scalar_like_escape_contains, - vec!["ba%", "ba\\x"], - "%a\\%", - like_utf8_scalar, - vec![true, false] - ); - - test_utf8_scalar!( test_utf8_scalar_like_dyn_escape_contains, vec!["ba%", "ba\\x"], "%a\\%", + like_utf8_scalar, like_utf8_scalar_dyn, vec![true, false] ); @@ -4716,64 +4759,80 @@ mod tests { ); test_utf8_scalar!( test_utf8_array_nlike_escape_testing, + test_utf8_array_nlike_escape_dyn_testing_dyn, vec!["varchar(255)", "int(255)", "varchar", "int"], "%(%)%", nlike_utf8_scalar, + nlike_utf8_scalar_dyn, vec![false, false, true, true] ); test_utf8_scalar!( test_utf8_array_nlike_scalar_escape_regex, + test_utf8_array_nlike_scalar_dyn_escape_regex, vec![".*", "a", "*"], ".*", nlike_utf8_scalar, + nlike_utf8_scalar_dyn, vec![false, true, true] ); test_utf8_scalar!( test_utf8_array_nlike_scalar_escape_regex_dot, + test_utf8_array_nlike_scalar_dyn_escape_regex_dot, vec![".", "a", "*"], ".", nlike_utf8_scalar, + nlike_utf8_scalar_dyn, vec![false, true, true] ); test_utf8_scalar!( test_utf8_array_nlike_scalar, + test_utf8_array_nlike_scalar_dyn, vec!["arrow", "parquet", "datafusion", "flight"], "%ar%", nlike_utf8_scalar, + nlike_utf8_scalar_dyn, vec![false, false, true, true] ); test_utf8_scalar!( test_utf8_array_nlike_scalar_start, + test_utf8_array_nlike_scalar_dyn_start, vec!["arrow", "parrow", "arrows", "arr"], "arrow%", nlike_utf8_scalar, + nlike_utf8_scalar_dyn, vec![false, true, false, true] ); test_utf8_scalar!( test_utf8_array_nlike_scalar_end, + test_utf8_array_nlike_scalar_dyn_end, vec!["arrow", "parrow", "arrows", "arr"], "%arrow", nlike_utf8_scalar, + nlike_utf8_scalar_dyn, vec![false, false, true, true] ); test_utf8_scalar!( test_utf8_array_nlike_scalar_equals, + test_utf8_array_nlike_scalar_dyn_equals, vec!["arrow", "parrow", "arrows", "arr"], "arrow", nlike_utf8_scalar, + nlike_utf8_scalar_dyn, vec![false, true, true, true] ); test_utf8_scalar!( test_utf8_array_nlike_scalar_one, + test_utf8_array_nlike_scalar_dyn_one, vec!["arrow", "arrows", "parrow", "arr"], "arrow_", nlike_utf8_scalar, + nlike_utf8_scalar_dyn, vec![true, false, true, true] ); @@ -4784,50 +4843,64 @@ mod tests { ilike_utf8, vec![true, true, true, false, false, true, false] ); + test_utf8_scalar!( ilike_utf8_scalar_escape_testing, + ilike_utf8_scalar_escape_dyn_testing, vec!["varchar(255)", "int(255)", "varchar", "int"], "%(%)%", ilike_utf8_scalar, + ilike_utf8_scalar_dyn, vec![true, true, false, false] ); + test_utf8_scalar!( test_utf8_array_ilike_scalar, + test_utf8_array_ilike_dyn_scalar, vec!["arrow", "parquet", "datafusion", "flight"], "%AR%", ilike_utf8_scalar, + ilike_utf8_scalar_dyn, vec![true, true, false, false] ); test_utf8_scalar!( test_utf8_array_ilike_scalar_start, + test_utf8_array_ilike_scalar_dyn_start, vec!["arrow", "parrow", "arrows", "ARR"], "aRRow%", ilike_utf8_scalar, + ilike_utf8_scalar_dyn, vec![true, false, true, false] ); test_utf8_scalar!( test_utf8_array_ilike_scalar_end, + test_utf8_array_ilike_scalar_dyn_end, vec!["ArroW", "parrow", "ARRowS", "arr"], "%arrow", ilike_utf8_scalar, + ilike_utf8_scalar_dyn, vec![true, true, false, false] ); test_utf8_scalar!( test_utf8_array_ilike_scalar_equals, + test_utf8_array_ilike_scalar_dyn_equals, vec!["arrow", "parrow", "arrows", "arr"], "Arrow", ilike_utf8_scalar, + ilike_utf8_scalar_dyn, vec![true, false, false, false] ); test_utf8_scalar!( test_utf8_array_ilike_scalar_one, + test_utf8_array_ilike_scalar_dyn_one, vec!["arrow", "arrows", "parrow", "arr"], "arrow_", ilike_utf8_scalar, + ilike_utf8_scalar_dyn, vec![false, true, false, false] ); @@ -4838,50 +4911,64 @@ mod tests { nilike_utf8, vec![false, false, false, true, true, false, true] ); + test_utf8_scalar!( nilike_utf8_scalar_escape_testing, + nilike_utf8_scalar_escape_dyn_testing, vec!["varchar(255)", "int(255)", "varchar", "int"], "%(%)%", nilike_utf8_scalar, + nilike_utf8_scalar_dyn, vec![false, false, true, true] ); + test_utf8_scalar!( test_utf8_array_nilike_scalar, + test_utf8_array_nilike_dyn_scalar, vec!["arrow", "parquet", "datafusion", "flight"], "%AR%", nilike_utf8_scalar, + nilike_utf8_scalar_dyn, vec![false, false, true, true] ); test_utf8_scalar!( test_utf8_array_nilike_scalar_start, + test_utf8_array_nilike_scalar_dyn_start, vec!["arrow", "parrow", "arrows", "ARR"], "aRRow%", nilike_utf8_scalar, + nilike_utf8_scalar_dyn, vec![false, true, false, true] ); test_utf8_scalar!( test_utf8_array_nilike_scalar_end, + test_utf8_array_nilike_scalar_dyn_end, vec!["ArroW", "parrow", "ARRowS", "arr"], "%arrow", nilike_utf8_scalar, + nilike_utf8_scalar_dyn, vec![false, false, true, true] ); test_utf8_scalar!( test_utf8_array_nilike_scalar_equals, + test_utf8_array_nilike_scalar_dyn_equals, vec!["arRow", "parrow", "arrows", "arr"], "Arrow", nilike_utf8_scalar, + nilike_utf8_scalar_dyn, vec![false, true, true, true] ); test_utf8_scalar!( test_utf8_array_nilike_scalar_one, + test_utf8_array_nilike_scalar_dyn_one, vec!["arrow", "arrows", "parrow", "arr"], "arrow_", nilike_utf8_scalar, + nilike_utf8_scalar_dyn, vec![true, false, true, true] ); @@ -6252,11 +6339,7 @@ mod tests { let dict_array: DictionaryArray = data.into_iter().collect(); - let data = - vec![Some("Earth"), Some("Fire"), Some("Water"), Some("Air"), None, Some("Air")]; - - let dict_arrayref: DictionaryArray = data.into_iter().collect(); - let dict_arrayref = Arc::new(dict_arrayref) as ArrayRef; + let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; assert_eq!( like_dict_scalar(&dict_array, "Air").unwrap(), @@ -6548,6 +6631,8 @@ mod tests { let dict_array: DictionaryArray = data.into_iter().collect(); + let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; + assert_eq!( nlike_dict_scalar(&dict_array, "Air").unwrap(), BooleanArray::from( @@ -6555,6 +6640,13 @@ mod tests { ), ); + assert_eq!( + nlike_utf8_scalar_dyn(&dict_arrayref, "Air").unwrap(), + BooleanArray::from( + vec![Some(true), Some(true), Some(true), Some(false), None, Some(false)] + ), + ); + assert_eq!( nlike_dict_scalar(&dict_array, "Wa%").unwrap(), BooleanArray::from( @@ -6562,6 +6654,13 @@ mod tests { ), ); + assert_eq!( + nlike_utf8_scalar_dyn(&dict_arrayref, "Wa%").unwrap(), + BooleanArray::from( + vec![Some(true), Some(true), Some(false), Some(true), None, Some(true)] + ), + ); + assert_eq!( nlike_dict_scalar(&dict_array, "%r").unwrap(), BooleanArray::from( @@ -6569,6 +6668,13 @@ mod tests { ), ); + assert_eq!( + nlike_utf8_scalar_dyn(&dict_arrayref, "%r").unwrap(), + BooleanArray::from( + vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] + ), + ); + assert_eq!( nlike_dict_scalar(&dict_array, "%i%").unwrap(), BooleanArray::from( @@ -6576,12 +6682,26 @@ mod tests { ), ); + assert_eq!( + nlike_utf8_scalar_dyn(&dict_arrayref, "%i%").unwrap(), + BooleanArray::from( + vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] + ), + ); + assert_eq!( nlike_dict_scalar(&dict_array, "%a%r%").unwrap(), BooleanArray::from( vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] ), ); + + assert_eq!( + nlike_utf8_scalar_dyn(&dict_arrayref, "%a%r%").unwrap(), + BooleanArray::from( + vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] + ), + ); } #[test] @@ -6591,6 +6711,8 @@ mod tests { let dict_array: DictionaryArray = data.into_iter().collect(); + let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; + assert_eq!( ilike_dict_scalar(&dict_array, "air").unwrap(), BooleanArray::from( @@ -6598,6 +6720,13 @@ mod tests { ), ); + assert_eq!( + ilike_utf8_scalar_dyn(&dict_arrayref, "air").unwrap(), + BooleanArray::from( + vec![Some(false), Some(false), Some(false), Some(true), None, Some(true)] + ), + ); + assert_eq!( ilike_dict_scalar(&dict_array, "wa%").unwrap(), BooleanArray::from( @@ -6605,6 +6734,13 @@ mod tests { ), ); + assert_eq!( + ilike_utf8_scalar_dyn(&dict_arrayref, "wa%").unwrap(), + BooleanArray::from( + vec![Some(false), Some(false), Some(true), Some(false), None, Some(false)] + ), + ); + assert_eq!( ilike_dict_scalar(&dict_array, "%R").unwrap(), BooleanArray::from( @@ -6612,6 +6748,13 @@ mod tests { ), ); + assert_eq!( + ilike_utf8_scalar_dyn(&dict_arrayref, "%R").unwrap(), + BooleanArray::from( + vec![Some(false), Some(false), Some(true), Some(true), None, Some(true)] + ), + ); + assert_eq!( ilike_dict_scalar(&dict_array, "%I%").unwrap(), BooleanArray::from( @@ -6619,12 +6762,26 @@ mod tests { ), ); + assert_eq!( + ilike_utf8_scalar_dyn(&dict_arrayref, "%I%").unwrap(), + BooleanArray::from( + vec![Some(false), Some(true), Some(false), Some(true), None, Some(true)] + ), + ); + assert_eq!( ilike_dict_scalar(&dict_array, "%A%r%").unwrap(), BooleanArray::from( vec![Some(true), Some(false), Some(true), Some(true), None, Some(true)] ), ); + + assert_eq!( + ilike_utf8_scalar_dyn(&dict_arrayref, "%A%r%").unwrap(), + BooleanArray::from( + vec![Some(true), Some(false), Some(true), Some(true), None, Some(true)] + ), + ); } #[test] @@ -6634,6 +6791,8 @@ mod tests { let dict_array: DictionaryArray = data.into_iter().collect(); + let dict_arrayref = Arc::new(dict_array.clone()) as ArrayRef; + assert_eq!( nilike_dict_scalar(&dict_array, "air").unwrap(), BooleanArray::from( @@ -6641,6 +6800,13 @@ mod tests { ), ); + assert_eq!( + nilike_utf8_scalar_dyn(&dict_arrayref, "air").unwrap(), + BooleanArray::from( + vec![Some(true), Some(true), Some(true), Some(false), None, Some(false)] + ), + ); + assert_eq!( nilike_dict_scalar(&dict_array, "wa%").unwrap(), BooleanArray::from( @@ -6648,6 +6814,13 @@ mod tests { ), ); + assert_eq!( + nilike_utf8_scalar_dyn(&dict_arrayref, "wa%").unwrap(), + BooleanArray::from( + vec![Some(true), Some(true), Some(false), Some(true), None, Some(true)] + ), + ); + assert_eq!( nilike_dict_scalar(&dict_array, "%R").unwrap(), BooleanArray::from( @@ -6655,6 +6828,13 @@ mod tests { ), ); + assert_eq!( + nilike_utf8_scalar_dyn(&dict_arrayref, "%R").unwrap(), + BooleanArray::from( + vec![Some(true), Some(true), Some(false), Some(false), None, Some(false)] + ), + ); + assert_eq!( nilike_dict_scalar(&dict_array, "%I%").unwrap(), BooleanArray::from( @@ -6662,12 +6842,26 @@ mod tests { ), ); + assert_eq!( + nilike_utf8_scalar_dyn(&dict_arrayref, "%I%").unwrap(), + BooleanArray::from( + vec![Some(true), Some(false), Some(true), Some(false), None, Some(false)] + ), + ); + assert_eq!( nilike_dict_scalar(&dict_array, "%A%r%").unwrap(), BooleanArray::from( vec![Some(false), Some(true), Some(false), Some(false), None, Some(false)] ), ); + + assert_eq!( + nilike_utf8_scalar_dyn(&dict_arrayref, "%A%r%").unwrap(), + BooleanArray::from( + vec![Some(false), Some(true), Some(false), Some(false), None, Some(false)] + ), + ); } #[test]