From d721a40ca6f1b536f60c452fa17305794a229b27 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Mon, 23 Jan 2023 20:16:41 +0000 Subject: [PATCH] Fix nullif null count (#3579) --- arrow-select/src/nullif.rs | 87 +++++++++++++++++++++++++++++--------- 1 file changed, 67 insertions(+), 20 deletions(-) diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index 23a586f63652..40c54d0ce956 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -37,7 +37,7 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result Result ( buffer_bin_and( &right_data.buffers()[0], @@ -68,27 +68,26 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { - bitwise_bin_op_helper(left, left_offset, &right, right_offset, len, |l, r| { - let t = l & !r; - valid_count += t.count_ones() as usize; - t - }) + let mut valid_count = 0; + let b = + bitwise_bin_op_helper(left, l_offset, &right, r_offset, len, |l, r| { + let t = l & !r; + valid_count += t.count_ones() as usize; + t + }); + (b, len - valid_count) } None => { - let buffer = bitwise_unary_op_helper(&right, right_offset, len, |b| { + let mut null_count = 0; + let buffer = bitwise_unary_op_helper(&right, r_offset, len, |b| { let t = !b; - valid_count += t.count_ones() as usize; + null_count += t.count_zeros() as usize; t }); - // We need to compensate for the additional bits read from the end - let remainder_len = len % 64; - if remainder_len != 0 { - valid_count -= 64 - remainder_len - } - buffer + (buffer, null_count) } }; @@ -96,15 +95,14 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result combined, _ => { - let mut builder = BooleanBufferBuilder::new(len + left_offset); + let mut builder = BooleanBufferBuilder::new(len + l_offset); // Pad with 0s up to offset - builder.resize(left_offset); + builder.resize(l_offset); builder.append_packed_range(0..len, &combined); builder.finish() } }; - let null_count = len - valid_count; let data = left_data .clone() .into_builder() @@ -125,6 +123,7 @@ mod tests { use arrow_array::{Int32Array, StringArray, StructArray}; use arrow_data::ArrayData; use arrow_schema::{DataType, Field}; + use rand::{thread_rng, Rng}; #[test] fn test_nullif_int_array() { @@ -464,4 +463,52 @@ mod tests { let res = nullif(&a, &mask).unwrap(); assert_eq!(res.as_ref(), &a); } + + fn test_nullif(values: &Int32Array, filter: &BooleanArray) { + let expected: Int32Array = values + .iter() + .zip(filter.iter()) + .map(|(a, b)| match b { + Some(true) => None, + Some(false) | None => a, + }) + .collect(); + + let r = nullif(values, &filter).unwrap(); + r.data().validate().unwrap(); + + assert_eq!(expected.data(), r.data()); + } + + #[test] + fn nullif_fuzz() { + let mut rng = thread_rng(); + + let arrays = [ + Int32Array::from(vec![0; 128]), + (0..128).map(|_| rng.gen_bool(0.5).then(|| 0)).collect(), + ]; + + for a in arrays { + let a_slices = [(0, 128), (64, 64), (0, 64), (32, 32), (0, 0), (32, 0)]; + + for (a_offset, a_length) in a_slices { + let a = a.slice(a_offset, a_length); + let a = as_primitive_array::(a.as_ref()); + + for i in 1..65 { + let b_start_offset = rng.gen_range(0..i); + let b_end_offset = rng.gen_range(0..i); + + let b: BooleanArray = (0..a_length + b_start_offset + b_end_offset) + .map(|_| rng.gen_bool(0.5).then(|| rng.gen_bool(0.5))) + .collect(); + let b = b.slice(b_start_offset, a_length); + let b = as_boolean_array(b.as_ref()); + + test_nullif(a, &b); + } + } + } + } }