From bc751794b7a82bb2639b63d8065dfb25a951f458 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= <git@jhorstmann.net>
Date: Sat, 15 Jan 2022 15:42:09 +0100
Subject: [PATCH 1/3] Extend arithmetic benchmarks

---
 arrow/benches/arithmetic_kernels.rs     | 62 +++++++++++++++++--------
 arrow/src/buffer/immutable.rs           |  1 +
 arrow/src/compute/kernels/arithmetic.rs | 16 +++++++
 3 files changed, 60 insertions(+), 19 deletions(-)
diff --git a/arrow/benches/arithmetic_kernels.rs b/arrow/benches/arithmetic_kernels.rs
index bbe412366988..4be4a26933aa 100644
--- a/arrow/benches/arithmetic_kernels.rs
+++ b/arrow/benches/arithmetic_kernels.rs
@@ -24,7 +24,6 @@ use std::sync::Arc;
 
 extern crate arrow;
 
-use arrow::compute::kernels::limit::*;
 use arrow::util::bench_util::*;
 use arrow::{array::*, datatypes::Float32Type};
 use arrow::{compute::kernels::arithmetic::*, util::test_util::seedable_rng};
@@ -59,44 +58,69 @@ fn bench_divide(arr_a: &ArrayRef, arr_b: &ArrayRef) {
     criterion::black_box(divide(arr_a, arr_b).unwrap());
 }
 
+fn bench_divide_unchecked(arr_a: &ArrayRef, arr_b: &ArrayRef) {
+    let arr_a = arr_a.as_any().downcast_ref::<Float32Array>().unwrap();
+    let arr_b = arr_b.as_any().downcast_ref::<Float32Array>().unwrap();
+    criterion::black_box(divide_unchecked(arr_a, arr_b).unwrap());
+}
+
 fn bench_divide_scalar(array: &ArrayRef, divisor: f32) {
     let array = array.as_any().downcast_ref::<Float32Array>().unwrap();
     criterion::black_box(divide_scalar(array, divisor).unwrap());
 }
 
-fn bench_limit(arr_a: &ArrayRef, max: usize) {
-    criterion::black_box(limit(arr_a, max));
+fn bench_modulo(arr_a: &ArrayRef, arr_b: &ArrayRef) {
+    let arr_a = arr_a.as_any().downcast_ref::<Float32Array>().unwrap();
+    let arr_b = arr_b.as_any().downcast_ref::<Float32Array>().unwrap();
+    criterion::black_box(modulus(arr_a, arr_b).unwrap());
+}
+
+fn bench_modulo_scalar(array: &ArrayRef, divisor: f32) {
+    let array = array.as_any().downcast_ref::<Float32Array>().unwrap();
+    criterion::black_box(modulus_scalar(array, divisor).unwrap());
 }
 
 fn add_benchmark(c: &mut Criterion) {
-    let arr_a = create_array(512, false);
-    let arr_b = create_array(512, false);
+    const BATCH_SIZE: usize = 64 * 1024;
+    let arr_a = create_array(BATCH_SIZE, false);
+    let arr_b = create_array(BATCH_SIZE, false);
     let scalar = seedable_rng().gen();
 
-    c.bench_function("add 512", |b| b.iter(|| bench_add(&arr_a, &arr_b)));
-    c.bench_function("subtract 512", |b| {
-        b.iter(|| bench_subtract(&arr_a, &arr_b))
+    c.bench_function("add", |b| b.iter(|| bench_add(&arr_a, &arr_b)));
+    c.bench_function("subtract", |b| b.iter(|| bench_subtract(&arr_a, &arr_b)));
+    c.bench_function("multiply", |b| b.iter(|| bench_multiply(&arr_a, &arr_b)));
+    c.bench_function("divide", |b| b.iter(|| bench_divide(&arr_a, &arr_b)));
+    c.bench_function("divide_unchecked", |b| {
+        b.iter(|| bench_divide_unchecked(&arr_a, &arr_b))
     });
-    c.bench_function("multiply 512", |b| {
-        b.iter(|| bench_multiply(&arr_a, &arr_b))
-    });
-    c.bench_function("divide 512", |b| b.iter(|| bench_divide(&arr_a, &arr_b)));
-    c.bench_function("divide_scalar 512", |b| {
+    c.bench_function("divide_scalar", |b| {
         b.iter(|| bench_divide_scalar(&arr_a, scalar))
     });
-    c.bench_function("limit 512, 512", |b| b.iter(|| bench_limit(&arr_a, 512)));
+    c.bench_function("modulo", |b| b.iter(|| bench_modulo(&arr_a, &arr_b)));
+    c.bench_function("modulo_scalar", |b| {
+        b.iter(|| bench_modulo_scalar(&arr_a, scalar))
+    });
 
-    let arr_a_nulls = create_array(512, false);
-    let arr_b_nulls = create_array(512, false);
-    c.bench_function("add_nulls_512", |b| {
+    let arr_a_nulls = create_array(BATCH_SIZE, true);
+    let arr_b_nulls = create_array(BATCH_SIZE, true);
+    c.bench_function("add_nulls", |b| {
         b.iter(|| bench_add(&arr_a_nulls, &arr_b_nulls))
     });
-    c.bench_function("divide_nulls_512", |b| {
+    c.bench_function("divide_nulls", |b| {
         b.iter(|| bench_divide(&arr_a_nulls, &arr_b_nulls))
     });
-    c.bench_function("divide_scalar_nulls_512", |b| {
+    c.bench_function("divide_nulls_unchecked", |b| {
+        b.iter(|| bench_divide_unchecked(&arr_a_nulls, &arr_b_nulls))
+    });
+    c.bench_function("divide_scalar_nulls", |b| {
         b.iter(|| bench_divide_scalar(&arr_a_nulls, scalar))
     });
+    c.bench_function("modulo_nulls", |b| {
+        b.iter(|| bench_modulo(&arr_a_nulls, &arr_b_nulls))
+    });
+    c.bench_function("modulo_scalar_nulls", |b| {
+        b.iter(|| bench_modulo_scalar(&arr_a_nulls, scalar))
+    });
 }
 
 criterion_group!(benches, add_benchmark);
diff --git a/arrow/src/buffer/immutable.rs b/arrow/src/buffer/immutable.rs
index 0823de54f3fe..a3de0d496346 100644
--- a/arrow/src/buffer/immutable.rs
+++ b/arrow/src/buffer/immutable.rs
@@ -153,6 +153,7 @@ impl Buffer {
     ///
     /// Note that this should be used cautiously, and the returned pointer should not be
     /// stored anywhere, to avoid dangling pointers.
+    #[inline]
     pub fn as_ptr(&self) -> *const u8 {
         unsafe { self.data.ptr().as_ptr().add(self.offset) }
     }
diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs
index 4eb957678892..71ed7af324e6 100644
--- a/arrow/src/compute/kernels/arithmetic.rs
+++ b/arrow/src/compute/kernels/arithmetic.rs
@@ -756,6 +756,22 @@ where
     return math_checked_divide_op(left, right, |a, b| a / b);
 }
 
+/// Perform `left / right` operation on two arrays without checking for division by zero.
+/// If either left or right value is null then the result is also null. If any right hand value is zero then the result of this
+pub fn divide_unchecked<T>(
+    left: &PrimitiveArray<T>,
+    right: &PrimitiveArray<T>,
+) -> Result<PrimitiveArray<T>>
+where
+    T: datatypes::ArrowFloatNumericType,
+    T::Native: Div<Output = T::Native>,
+{
+    #[cfg(feature = "simd")]
+    return simd_math_op(&left, &right, |a, b| a / b, |a, b| a / b);
+    #[cfg(not(feature = "simd"))]
+    return math_op(left, right, |a, b| a / b);
+}
+
 /// Modulus every value in an array by a scalar. If any value in the array is null then the
 /// result is also null. If the scalar is zero then the result of this operation will be
 /// `Err(ArrowError::DivideByZero)`.

From 551be27542b95b307ae2c647c04f2e881b9734c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= <git@jhorstmann.net>
Date: Sat, 22 Jan 2022 16:07:23 +0100
Subject: [PATCH 2/3] Remove explicit simd arithmetic except for div/mod
 because autovectorization generates better code

---
 arrow/src/compute/kernels/arithmetic.rs | 212 +-----------------------
 1 file changed, 1 insertion(+), 211 deletions(-)

diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs
index 71ed7af324e6..397ae159dd30 100644
--- a/arrow/src/compute/kernels/arithmetic.rs
+++ b/arrow/src/compute/kernels/arithmetic.rs
@@ -29,7 +29,6 @@ use num::{One, Zero};
 use crate::buffer::Buffer;
 #[cfg(feature = "simd")]
 use crate::buffer::MutableBuffer;
-#[cfg(not(feature = "simd"))]
 use crate::compute::kernels::arity::unary;
 use crate::compute::util::combine_option_bitmap;
 use crate::datatypes;
@@ -42,65 +41,6 @@ use std::borrow::BorrowMut;
 #[cfg(feature = "simd")]
 use std::slice::{ChunksExact, ChunksExactMut};
 
-/// Creates a new PrimitiveArray by applying `simd_op` to the input `array`.
-/// If the length of the array is not multiple of the number of vector lanes
-/// then the remainder of the array will be calculated using `scalar_op`.
-/// Any operation on a `NULL` value will result in a `NULL` value in the output.
-#[cfg(feature = "simd")]
-fn simd_unary_math_op<T, SIMD_OP, SCALAR_OP>(
-    array: &PrimitiveArray<T>,
-    simd_op: SIMD_OP,
-    scalar_op: SCALAR_OP,
-) -> PrimitiveArray<T>
-where
-    T: ArrowNumericType,
-    SIMD_OP: Fn(T::Simd) -> T::Simd,
-    SCALAR_OP: Fn(T::Native) -> T::Native,
-{
-    let lanes = T::lanes();
-    let buffer_size = array.len() * std::mem::size_of::<T::Native>();
-
-    let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false);
-
-    // safety: result is newly created above, always written as a T below
-    let mut result_chunks = unsafe { result.typed_data_mut().chunks_exact_mut(lanes) };
-    let mut array_chunks = array.values().chunks_exact(lanes);
-
-    result_chunks
-        .borrow_mut()
-        .zip(array_chunks.borrow_mut())
-        .for_each(|(result_slice, input_slice)| {
-            let simd_input = T::load(input_slice);
-            let simd_result = T::unary_op(simd_input, &simd_op);
-            T::write(simd_result, result_slice);
-        });
-
-    let result_remainder = result_chunks.into_remainder();
-    let array_remainder = array_chunks.remainder();
-
-    result_remainder.into_iter().zip(array_remainder).for_each(
-        |(scalar_result, scalar_input)| {
-            *scalar_result = scalar_op(*scalar_input);
-        },
-    );
-
-    let data = unsafe {
-        ArrayData::new_unchecked(
-            T::DATA_TYPE,
-            array.len(),
-            None,
-            array
-                .data_ref()
-                .null_buffer()
-                .map(|b| b.bit_slice(array.offset(), array.len())),
-            0,
-            vec![result.into()],
-            vec![],
-        )
-    };
-    PrimitiveArray::<T>::from(data)
-}
-
 /// Helper function to perform math lambda function on values from two arrays. If either
 /// left or right value is null then the output value is also null, so `1 + null` is
 /// `null`.
@@ -227,79 +167,6 @@ where
     Ok(PrimitiveArray::<T>::from(data))
 }
 
-/// Creates a new PrimitiveArray by applying `simd_op` to the `left` and `right` input arrays.
-/// If the length of the arrays is not multiple of the number of vector lanes
-/// then the remainder of the array will be calculated using `scalar_op`.
-/// Any operation on a `NULL` value will result in a `NULL` value in the output.
-///
-/// # Errors
-///
-/// This function errors if the arrays have different lengths
-#[cfg(feature = "simd")]
-fn simd_math_op<T, SIMD_OP, SCALAR_OP>(
-    left: &PrimitiveArray<T>,
-    right: &PrimitiveArray<T>,
-    simd_op: SIMD_OP,
-    scalar_op: SCALAR_OP,
-) -> Result<PrimitiveArray<T>>
-where
-    T: ArrowNumericType,
-    SIMD_OP: Fn(T::Simd, T::Simd) -> T::Simd,
-    SCALAR_OP: Fn(T::Native, T::Native) -> T::Native,
-{
-    if left.len() != right.len() {
-        return Err(ArrowError::ComputeError(
-            "Cannot perform math operation on arrays of different length".to_string(),
-        ));
-    }
-
-    let null_bit_buffer =
-        combine_option_bitmap(left.data_ref(), right.data_ref(), left.len())?;
-
-    let lanes = T::lanes();
-    let buffer_size = left.len() * std::mem::size_of::<T::Native>();
-    let mut result = MutableBuffer::new(buffer_size).with_bitset(buffer_size, false);
-
-    // safety: result is newly created above, always written as a T below
-    let mut result_chunks = unsafe { result.typed_data_mut().chunks_exact_mut(lanes) };
-    let mut left_chunks = left.values().chunks_exact(lanes);
-    let mut right_chunks = right.values().chunks_exact(lanes);
-
-    result_chunks
-        .borrow_mut()
-        .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut()))
-        .for_each(|(result_slice, (left_slice, right_slice))| {
-            let simd_left = T::load(left_slice);
-            let simd_right = T::load(right_slice);
-            let simd_result = T::bin_op(simd_left, simd_right, &simd_op);
-            T::write(simd_result, result_slice);
-        });
-
-    let result_remainder = result_chunks.into_remainder();
-    let left_remainder = left_chunks.remainder();
-    let right_remainder = right_chunks.remainder();
-
-    result_remainder
-        .iter_mut()
-        .zip(left_remainder.iter().zip(right_remainder.iter()))
-        .for_each(|(scalar_result, (scalar_left, scalar_right))| {
-            *scalar_result = scalar_op(*scalar_left, *scalar_right);
-        });
-
-    let data = unsafe {
-        ArrayData::new_unchecked(
-            T::DATA_TYPE,
-            left.len(),
-            None,
-            null_bit_buffer,
-            0,
-            vec![result.into()],
-            vec![],
-        )
-    };
-    Ok(PrimitiveArray::<T>::from(data))
-}
-
 /// Calculates the modulus operation `left % right` on two SIMD inputs.
 /// The lower-most bits of `valid_mask` specify which vector lanes are considered as valid.
 ///
@@ -566,9 +433,6 @@ where
     T: ArrowNumericType,
     T::Native: Add<Output = T::Native>,
 {
-    #[cfg(feature = "simd")]
-    return simd_math_op(&left, &right, |a, b| a + b, |a, b| a + b);
-    #[cfg(not(feature = "simd"))]
     return math_op(left, right, |a, b| a + b);
 }
 
@@ -582,16 +446,6 @@ where
     T: datatypes::ArrowNumericType,
     T::Native: Add<Output = T::Native>,
 {
-    #[cfg(feature = "simd")]
-    {
-        let scalar_vector = T::init(scalar);
-        return Ok(simd_unary_math_op(
-            array,
-            |x| x + scalar_vector,
-            |x| x + scalar,
-        ));
-    }
-    #[cfg(not(feature = "simd"))]
     return Ok(unary(array, |value| value + scalar));
 }
 
@@ -605,9 +459,6 @@ where
     T: datatypes::ArrowNumericType,
     T::Native: Sub<Output = T::Native>,
 {
-    #[cfg(feature = "simd")]
-    return simd_math_op(&left, &right, |a, b| a - b, |a, b| a - b);
-    #[cfg(not(feature = "simd"))]
     return math_op(left, right, |a, b| a - b);
 }
 
@@ -625,16 +476,6 @@ where
         + Div<Output = T::Native>
         + Zero,
 {
-    #[cfg(feature = "simd")]
-    {
-        let scalar_vector = T::init(scalar);
-        return Ok(simd_unary_math_op(
-            array,
-            |x| x - scalar_vector,
-            |x| x - scalar,
-        ));
-    }
-    #[cfg(not(feature = "simd"))]
     return Ok(unary(array, |value| value - scalar));
 }
 
@@ -644,12 +485,6 @@ where
     T: datatypes::ArrowNumericType,
     T::Native: Neg<Output = T::Native>,
 {
-    #[cfg(feature = "simd")]
-    {
-        let zero_vector = T::init(T::default_value());
-        return Ok(simd_unary_math_op(array, |x| zero_vector - x, |x| -x));
-    }
-    #[cfg(not(feature = "simd"))]
     return Ok(unary(array, |x| -x));
 }
 
@@ -662,16 +497,6 @@ where
     T: datatypes::ArrowFloatNumericType,
     T::Native: Pow<T::Native, Output = T::Native>,
 {
-    #[cfg(feature = "simd")]
-    {
-        let raise_vector = T::init(raise);
-        return Ok(simd_unary_math_op(
-            array,
-            |x| T::pow(x, raise_vector),
-            |x| x.pow(raise),
-        ));
-    }
-    #[cfg(not(feature = "simd"))]
     return Ok(unary(array, |x| x.pow(raise)));
 }
 
@@ -685,9 +510,6 @@ where
     T: datatypes::ArrowNumericType,
     T::Native: Mul<Output = T::Native>,
 {
-    #[cfg(feature = "simd")]
-    return simd_math_op(&left, &right, |a, b| a * b, |a, b| a * b);
-    #[cfg(not(feature = "simd"))]
     return math_op(left, right, |a, b| a * b);
 }
 
@@ -707,16 +529,6 @@ where
         + Zero
         + One,
 {
-    #[cfg(feature = "simd")]
-    {
-        let scalar_vector = T::init(scalar);
-        return Ok(simd_unary_math_op(
-            array,
-            |x| x * scalar_vector,
-            |x| x * scalar,
-        ));
-    }
-    #[cfg(not(feature = "simd"))]
     return Ok(unary(array, |value| value * scalar));
 }
 
@@ -757,6 +569,7 @@ where
 }
 
 /// Perform `left / right` operation on two arrays without checking for division by zero.
+/// The result of dividing by zero follows normal floating point rules.
 /// If either left or right value is null then the result is also null. If any right hand value is zero then the result of this
 pub fn divide_unchecked<T>(
     left: &PrimitiveArray<T>,
@@ -766,9 +579,6 @@ where
     T: datatypes::ArrowFloatNumericType,
     T::Native: Div<Output = T::Native>,
 {
-    #[cfg(feature = "simd")]
-    return simd_math_op(&left, &right, |a, b| a / b, |a, b| a / b);
-    #[cfg(not(feature = "simd"))]
     return math_op(left, right, |a, b| a / b);
 }
 
@@ -787,16 +597,6 @@ where
         return Err(ArrowError::DivideByZero);
     }
 
-    #[cfg(feature = "simd")]
-    {
-        let modulo_vector = T::init(modulo);
-        return Ok(simd_unary_math_op(
-            &array,
-            |a| a % modulo_vector,
-            |a| a % modulo,
-        ));
-    }
-    #[cfg(not(feature = "simd"))]
     return Ok(unary(array, |a| a % modulo));
 }
 
@@ -814,16 +614,6 @@ where
     if divisor.is_zero() {
         return Err(ArrowError::DivideByZero);
     }
-    #[cfg(feature = "simd")]
-    {
-        let divisor_vector = T::init(divisor);
-        return Ok(simd_unary_math_op(
-            &array,
-            |a| a / divisor_vector,
-            |a| a / divisor,
-        ));
-    }
-    #[cfg(not(feature = "simd"))]
     return Ok(unary(array, |a| a / divisor));
 }
 

From ac6a599873e0547c0932726562f87ec210595cab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rn=20Horstmann?= <git@jhorstmann.net>
Date: Sat, 22 Jan 2022 16:54:28 +0100
Subject: [PATCH 3/3] Remove unneeded return keywords

---
 arrow/src/compute/kernels/arithmetic.rs | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs
index 397ae159dd30..dcdfa952377d 100644
--- a/arrow/src/compute/kernels/arithmetic.rs
+++ b/arrow/src/compute/kernels/arithmetic.rs
@@ -433,7 +433,7 @@ where
     T: ArrowNumericType,
     T::Native: Add<Output = T::Native>,
 {
-    return math_op(left, right, |a, b| a + b);
+    math_op(left, right, |a, b| a + b)
 }
 
 /// Add every value in an array by a scalar. If any value in the array is null then the
@@ -446,7 +446,7 @@ where
     T: datatypes::ArrowNumericType,
     T::Native: Add<Output = T::Native>,
 {
-    return Ok(unary(array, |value| value + scalar));
+    Ok(unary(array, |value| value + scalar))
 }
 
 /// Perform `left - right` operation on two arrays. If either left or right value is null
@@ -459,7 +459,7 @@ where
     T: datatypes::ArrowNumericType,
     T::Native: Sub<Output = T::Native>,
 {
-    return math_op(left, right, |a, b| a - b);
+    math_op(left, right, |a, b| a - b)
 }
 
 /// Subtract every value in an array by a scalar. If any value in the array is null then the
@@ -476,7 +476,7 @@ where
         + Div<Output = T::Native>
         + Zero,
 {
-    return Ok(unary(array, |value| value - scalar));
+    Ok(unary(array, |value| value - scalar))
 }
 
 /// Perform `-` operation on an array. If value is null then the result is also null.
@@ -485,7 +485,7 @@ where
     T: datatypes::ArrowNumericType,
     T::Native: Neg<Output = T::Native>,
 {
-    return Ok(unary(array, |x| -x));
+    Ok(unary(array, |x| -x))
 }
 
 /// Raise array with floating point values to the power of a scalar.
@@ -497,7 +497,7 @@ where
     T: datatypes::ArrowFloatNumericType,
     T::Native: Pow<T::Native, Output = T::Native>,
 {
-    return Ok(unary(array, |x| x.pow(raise)));
+    Ok(unary(array, |x| x.pow(raise)))
 }
 
 /// Perform `left * right` operation on two arrays. If either left or right value is null
@@ -510,7 +510,7 @@ where
     T: datatypes::ArrowNumericType,
     T::Native: Mul<Output = T::Native>,
 {
-    return math_op(left, right, |a, b| a * b);
+    math_op(left, right, |a, b| a * b)
 }
 
 /// Multiply every value in an array by a scalar. If any value in the array is null then the
@@ -529,7 +529,7 @@ where
         + Zero
         + One,
 {
-    return Ok(unary(array, |value| value * scalar));
+    Ok(unary(array, |value| value * scalar))
 }
 
 /// Perform `left % right` operation on two arrays. If either left or right value is null
@@ -579,7 +579,7 @@ where
     T: datatypes::ArrowFloatNumericType,
     T::Native: Div<Output = T::Native>,
 {
-    return math_op(left, right, |a, b| a / b);
+    math_op(left, right, |a, b| a / b)
 }
 
 /// Modulus every value in an array by a scalar. If any value in the array is null then the
@@ -597,7 +597,7 @@ where
         return Err(ArrowError::DivideByZero);
     }
 
-    return Ok(unary(array, |a| a % modulo));
+    Ok(unary(array, |a| a % modulo))
 }
 
 /// Divide every value in an array by a scalar. If any value in the array is null then the
@@ -614,7 +614,7 @@ where
     if divisor.is_zero() {
         return Err(ArrowError::DivideByZero);
     }
-    return Ok(unary(array, |a| a / divisor));
+    Ok(unary(array, |a| a / divisor))
 }
 
 #[cfg(test)]