diff --git a/arrow/src/compute/kernels/mod.rs b/arrow/src/compute/kernels/mod.rs index a0ef50a7b85a..73fef5b3d457 100644 --- a/arrow/src/compute/kernels/mod.rs +++ b/arrow/src/compute/kernels/mod.rs @@ -31,6 +31,7 @@ pub mod limit; pub mod partition; pub mod regexp; pub mod sort; +pub mod string; pub mod substring; pub mod take; pub mod temporal; diff --git a/arrow/src/compute/kernels/string.rs b/arrow/src/compute/kernels/string.rs new file mode 100644 index 000000000000..2af0bf85aab0 --- /dev/null +++ b/arrow/src/compute/kernels/string.rs @@ -0,0 +1,196 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::array::*; +use crate::compute::util::combine_option_bitmap; +use crate::error::{ArrowError, Result}; + +/// Returns the elementwise concatenation of `StringArray`. +/// +/// An index of the resulting `StringArray` is null if any of `StringArray` are null at that location. +/// +/// ```text +/// e.g: +/// +/// ["Hello"] + ["World"] = ["HelloWorld"] +/// +/// ["a", "b"] + [None, "c"] = [None, "bc"] +/// ``` +/// +/// Attention: `left` and `right` must have the same length. +pub fn string_concat( + left: &GenericStringArray, + right: &GenericStringArray, +) -> Result> { + if left.len() != right.len() { + return Err(ArrowError::ComputeError(format!( + "Arrays must have the same length: {} != {}", + left.len(), + right.len() + ))); + } + + let output_bitmap = combine_option_bitmap(left.data(), right.data(), left.len())?; + + let left_offsets = left.value_offsets(); + let right_offsets = right.value_offsets(); + + let left_buffer = left.value_data(); + let right_buffer = right.value_data(); + let left_values = left_buffer.as_slice(); + let right_values = right_buffer.as_slice(); + + let mut output_values = BufferBuilder::::new( + left_values.len() + right_values.len() + - left_offsets[0].to_usize().unwrap() + - right_offsets[0].to_usize().unwrap(), + ); + + let mut output_offsets = BufferBuilder::::new(left_offsets.len()); + output_offsets.append(Offset::zero()); + for (left_idx, right_idx) in left_offsets.windows(2).zip(right_offsets.windows(2)) { + output_values.append_slice( + &left_values + [left_idx[0].to_usize().unwrap()..left_idx[1].to_usize().unwrap()], + ); + output_values.append_slice( + &right_values + [right_idx[0].to_usize().unwrap()..right_idx[1].to_usize().unwrap()], + ); + output_offsets.append(Offset::from_usize(output_values.len()).unwrap()); + } + + let mut builder = + ArrayDataBuilder::new(GenericStringArray::::get_data_type()) + .len(left.len()) + .add_buffer(output_offsets.finish()) + .add_buffer(output_values.finish()); + + if let Some(null_bitmap) = output_bitmap { + builder = builder.null_bit_buffer(null_bitmap); + } + + // SAFETY - offsets valid by construction + Ok(unsafe { builder.build_unchecked() }.into()) +} + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_string_concat() { + let left = [Some("foo"), Some("bar"), None] + .into_iter() + .collect::(); + let right = [None, Some("yyy"), Some("zzz")] + .into_iter() + .collect::(); + + let output = string_concat(&left, &right).unwrap(); + + let expected = [None, Some("baryyy"), None] + .into_iter() + .collect::(); + + assert_eq!(output, expected); + } + + #[test] + fn test_string_concat_empty_string() { + let left = [Some("foo"), Some(""), Some("bar")] + .into_iter() + .collect::(); + let right = [Some("baz"), Some(""), Some("")] + .into_iter() + .collect::(); + + let output = string_concat(&left, &right).unwrap(); + + let expected = [Some("foobaz"), Some(""), Some("bar")] + .into_iter() + .collect::(); + + assert_eq!(output, expected); + } + + #[test] + fn test_string_concat_no_null() { + let left = StringArray::from(vec!["foo", "bar"]); + let right = StringArray::from(vec!["bar", "baz"]); + + let output = string_concat(&left, &right).unwrap(); + + let expected = StringArray::from(vec!["foobar", "barbaz"]); + + assert_eq!(output, expected); + } + + #[test] + fn test_string_concat_error() { + let left = StringArray::from(vec!["foo", "bar"]); + let right = StringArray::from(vec!["baz"]); + + let output = string_concat(&left, &right); + + assert!(output.is_err()); + } + + #[test] + fn test_string_concat_slice() { + let left = &StringArray::from(vec![None, Some("foo"), Some("bar"), Some("baz")]); + let right = &StringArray::from(vec![Some("boo"), None, Some("far"), Some("faz")]); + + let left_slice = left.slice(0, 3); + let right_slice = right.slice(1, 3); + let output = string_concat( + left_slice + .as_any() + .downcast_ref::>() + .unwrap(), + right_slice + .as_any() + .downcast_ref::>() + .unwrap(), + ) + .unwrap(); + + let expected = [None, Some("foofar"), Some("barfaz")] + .into_iter() + .collect::(); + + assert_eq!(output, expected); + + let left_slice = left.slice(2, 2); + let right_slice = right.slice(1, 2); + + let output = string_concat( + left_slice + .as_any() + .downcast_ref::>() + .unwrap(), + right_slice + .as_any() + .downcast_ref::>() + .unwrap(), + ) + .unwrap(); + + let expected = [None, Some("bazfar")].into_iter().collect::(); + + assert_eq!(output, expected); + } +}