diff --git a/arrow/src/array/array.rs b/arrow/src/array/array.rs index be19fea37cc8..f8c948d6b572 100644 --- a/arrow/src/array/array.rs +++ b/arrow/src/array/array.rs @@ -377,15 +377,17 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef { DataType::Null => Arc::new(NullArray::new(length)), DataType::Boolean => { let null_buf: Buffer = MutableBuffer::new_null(length).into(); - make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - Some(null_buf.clone()), - 0, - vec![null_buf], - vec![], - )) + unsafe { + make_array(ArrayData::new_unchecked( + data_type.clone(), + length, + Some(length), + Some(null_buf.clone()), + 0, + vec![null_buf], + vec![], + )) + } } DataType::Int8 => new_null_sized_array::(data_type, length), DataType::UInt8 => new_null_sized_array::(data_type, length), @@ -414,15 +416,17 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef { new_null_sized_array::(data_type, length) } }, - DataType::FixedSizeBinary(value_len) => make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![Buffer::from(vec![0u8; *value_len as usize * length])], - vec![], - )), + DataType::FixedSizeBinary(value_len) => unsafe { + make_array(ArrayData::new_unchecked( + data_type.clone(), + length, + Some(length), + Some(MutableBuffer::new_null(length).into()), + 0, + vec![Buffer::from(vec![0u8; *value_len as usize * length])], + vec![], + )) + }, DataType::Binary | DataType::Utf8 => { new_null_binary_array::(data_type, length) } @@ -435,19 +439,21 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef { DataType::LargeList(field) => { new_null_list_array::(data_type, field.data_type(), length) } - DataType::FixedSizeList(field, value_len) => make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![], - vec![ - new_null_array(field.data_type(), 
*value_len as usize * length) - .data() - .clone(), - ], - )), + DataType::FixedSizeList(field, value_len) => unsafe { + make_array(ArrayData::new_unchecked( + data_type.clone(), + length, + Some(length), + Some(MutableBuffer::new_null(length).into()), + 0, + vec![], + vec![ + new_null_array(field.data_type(), *value_len as usize * length) + .data() + .clone(), + ], + )) + }, DataType::Struct(fields) => { let fields: Vec<_> = fields .iter() @@ -467,15 +473,17 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef { let keys = new_null_array(key, length); let keys = keys.data(); - make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - keys.null_buffer().cloned(), - 0, - keys.buffers().into(), - vec![new_empty_array(value.as_ref()).data().clone()], - )) + unsafe { + make_array(ArrayData::new_unchecked( + data_type.clone(), + length, + Some(length), + keys.null_buffer().cloned(), + 0, + keys.buffers().into(), + vec![new_empty_array(value.as_ref()).data().clone()], + )) + } } DataType::Decimal(_, _) => { unimplemented!("Creating null Decimal array not yet supported") @@ -489,17 +497,21 @@ fn new_null_list_array( child_data_type: &DataType, length: usize, ) -> ArrayRef { - make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![Buffer::from( - vec![OffsetSize::zero(); length + 1].to_byte_slice(), - )], - vec![ArrayData::new_empty(child_data_type)], - )) + // Safety: buffers are created with the correct length + let data = unsafe { + ArrayData::new_unchecked( + data_type.clone(), + length, + Some(length), + Some(MutableBuffer::new_null(length).into()), + 0, + vec![Buffer::from( + vec![OffsetSize::zero(); length + 1].to_byte_slice(), + )], + vec![ArrayData::new_empty(child_data_type)], + ) + }; + make_array(data) } #[inline] @@ -507,18 +519,21 @@ fn new_null_binary_array( data_type: &DataType, length: usize, ) -> ArrayRef { - make_array(ArrayData::new( - 
data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![ - Buffer::from(vec![OffsetSize::zero(); length + 1].to_byte_slice()), - MutableBuffer::new(0).into(), - ], - vec![], - )) + let data = unsafe { + ArrayData::new_unchecked( + data_type.clone(), + length, + Some(length), + Some(MutableBuffer::new_null(length).into()), + 0, + vec![ + Buffer::from(vec![OffsetSize::zero(); length + 1].to_byte_slice()), + MutableBuffer::new(0).into(), + ], + vec![], + ) + }; + make_array(data) } #[inline] @@ -526,15 +541,19 @@ fn new_null_sized_array( data_type: &DataType, length: usize, ) -> ArrayRef { - make_array(ArrayData::new( - data_type.clone(), - length, - Some(length), - Some(MutableBuffer::new_null(length).into()), - 0, - vec![Buffer::from(vec![0u8; length * T::get_byte_width()])], - vec![], - )) + // Safety: buffers are created with the correct length + let data = unsafe { + ArrayData::new_unchecked( + data_type.clone(), + length, + Some(length), + Some(MutableBuffer::new_null(length).into()), + 0, + vec![Buffer::from(vec![0u8; length * T::get_byte_width()])], + vec![], + ) + }; + make_array(data) } /// Creates a new array from two FFI pointers. 
Used to import arrays from the C Data Interface diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs index b477fc6aa812..c8d95e1289e4 100644 --- a/arrow/src/array/array_binary.rs +++ b/arrow/src/array/array_binary.rs @@ -520,15 +520,17 @@ impl FixedSizeBinaryArray { } let size = size.unwrap_or(0); - let array_data = ArrayData::new( - DataType::FixedSizeBinary(size as i32), - len, - None, - Some(null_buf.into()), - 0, - vec![buffer.into()], - vec![], - ); + let array_data = unsafe { + ArrayData::new_unchecked( + DataType::FixedSizeBinary(size as i32), + len, + None, + Some(null_buf.into()), + 0, + vec![buffer.into()], + vec![], + ) + }; Ok(FixedSizeBinaryArray::from(array_data)) } diff --git a/arrow/src/array/array_boolean.rs b/arrow/src/array/array_boolean.rs index 9274e65c8d69..9969a0011c4f 100644 --- a/arrow/src/array/array_boolean.rs +++ b/arrow/src/array/array_boolean.rs @@ -212,14 +212,10 @@ impl>> FromIterator for BooleanArray { } }); - let data = ArrayData::new( - DataType::Boolean, + let data = ArrayData::new_boolean( data_len, - None, Some(null_buf.into()), - 0, - vec![val_buf.into()], - vec![], + val_buf.into(), ); BooleanArray::from(data) } diff --git a/arrow/src/array/array_dictionary.rs b/arrow/src/array/array_dictionary.rs index de9873ccee5c..6763999d2976 100644 --- a/arrow/src/array/array_dictionary.rs +++ b/arrow/src/array/array_dictionary.rs @@ -130,15 +130,18 @@ impl From for DictionaryArray { panic!("DictionaryArray's data type must match.") }; // create a zero-copy of the keys' data - let keys = PrimitiveArray::::from(ArrayData::new( - T::DATA_TYPE, - data.len(), - Some(data.null_count()), - data.null_buffer().cloned(), - data.offset(), - data.buffers().to_vec(), - vec![], - )); + let keys_data = unsafe { + ArrayData::new_unchecked( + T::DATA_TYPE, + data.len(), + Some(data.null_count()), + data.null_buffer().cloned(), + data.offset(), + data.buffers().to_vec(), + vec![], + ) + }; + let keys = 
PrimitiveArray::::from(keys_data); let values = make_array(data.child_data()[0].clone()); Self { data, diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs index 5777a0304844..4ee055ad39c8 100644 --- a/arrow/src/array/array_primitive.rs +++ b/arrow/src/array/array_primitive.rs @@ -124,14 +124,10 @@ impl PrimitiveArray { /// Creates a PrimitiveArray based on an iterator of values without nulls pub fn from_iter_values>(iter: I) -> Self { let val_buf: Buffer = iter.into_iter().collect(); - let data = ArrayData::new( - T::DATA_TYPE, + let data = ArrayData::new_primitive::( val_buf.len() / mem::size_of::<::Native>(), None, - None, - 0, - vec![val_buf], - vec![], + val_buf, ); PrimitiveArray::from(data) } @@ -140,14 +136,10 @@ impl PrimitiveArray { pub fn from_value(value: T::Native, count: usize) -> Self { // # Safety: length is known let val_buf = unsafe { Buffer::from_trusted_len_iter((0..count).map(|_| value)) }; - let data = ArrayData::new( - T::DATA_TYPE, + let data = ArrayData::new_primitive::( val_buf.len() / mem::size_of::<::Native>(), None, - None, - 0, - vec![val_buf], - vec![], + val_buf, ); PrimitiveArray::from(data) } @@ -338,14 +330,10 @@ impl::Native }) .collect(); - let data = ArrayData::new( - T::DATA_TYPE, + let data = ArrayData::new_primitive::( null_buf.len(), - None, Some(null_buf.into()), - 0, - vec![buffer], - vec![], + buffer, ); PrimitiveArray::from(data) } @@ -369,7 +357,7 @@ impl PrimitiveArray { let (null, buffer) = trusted_len_unzip(iterator); let data = - ArrayData::new(T::DATA_TYPE, len, None, Some(null), 0, vec![buffer], vec![]); + ArrayData::new_primitive::(len, Some(null), buffer); PrimitiveArray::from(data) } } diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index cb389cacc7f6..45c058c8eea3 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -21,7 +21,7 @@ use std::mem; use std::sync::Arc; -use crate::datatypes::{DataType, IntervalUnit}; +use crate::datatypes::{DataType, 
IntervalUnit, ArrowPrimitiveType}; use crate::{bitmap::Bitmap, datatypes::ArrowNativeType}; use crate::{ buffer::{Buffer, MutableBuffer}, @@ -238,8 +238,57 @@ pub struct ArrayData { pub type ArrayDataRef = Arc; +fn validate_null_bitmap(len: usize, null_bit_buffer: Option) -> (usize, Option) { + let null_count = count_nulls(null_bit_buffer.as_ref(), 0, len); + match null_bit_buffer { + Some(b) => { + assert!(b.len()*8 >= len); + (null_count, Some(Bitmap::from(b))) + }, + None => (0_usize, None), + } +} + impl ArrayData { - pub fn new( + pub fn new_primitive( + len: usize, + null_bit_buffer: Option, + buffer: Buffer, + ) -> Self { + assert!(buffer.len() >= len*T::get_byte_width()); + let (null_count, null_bitmap) = validate_null_bitmap(len, null_bit_buffer); + + Self { + data_type: T::DATA_TYPE, + len, + null_count, + offset: 0, + buffers: vec![buffer], + child_data: vec![], + null_bitmap, + } + } + + pub fn new_boolean( + len: usize, + null_bit_buffer: Option, + buffer: Buffer, + ) -> Self { + assert!(buffer.len()*8 >= len); + let (null_count, null_bitmap) = validate_null_bitmap(len, null_bit_buffer); + + Self { + data_type: DataType::Boolean, + len, + null_count, + offset: 0, + buffers: vec![buffer], + child_data: vec![], + null_bitmap, + } + } + + pub unsafe fn new_unchecked( data_type: DataType, len: usize, null_count: Option, @@ -485,7 +534,9 @@ impl ArrayData { DataType::Float16 => unreachable!(), }; - Self::new(data_type.clone(), 0, Some(0), None, 0, buffers, child_data) + unsafe { + Self::new_unchecked(data_type.clone(), 0, Some(0), None, 0, buffers, child_data) + } } } @@ -565,15 +616,18 @@ impl ArrayDataBuilder { } pub fn build(self) -> ArrayData { - ArrayData::new( - self.data_type, - self.len, - self.null_count, - self.null_bit_buffer, - self.offset, - self.buffers, - self.child_data, - ) + // TODO: this should validate the ArrayData before returning + unsafe { + ArrayData::new_unchecked( + self.data_type, + self.len, + self.null_count, + 
self.null_bit_buffer, + self.offset, + self.buffers, + self.child_data, + ) + } } } @@ -582,12 +636,14 @@ mod tests { use super::*; use crate::buffer::Buffer; + use crate::datatypes::Int32Type; use crate::util::bit_util; #[test] fn test_new() { - let arr_data = - ArrayData::new(DataType::Boolean, 10, Some(1), None, 2, vec![], vec![]); + let arr_data = unsafe { + ArrayData::new_unchecked(DataType::Boolean, 10, Some(1), None, 2, vec![], vec![]) + }; assert_eq!(10, arr_data.len()); assert_eq!(1, arr_data.null_count()); assert_eq!(2, arr_data.offset()); @@ -597,14 +653,10 @@ mod tests { #[test] fn test_builder() { - let child_arr_data = ArrayData::new( - DataType::Int32, + let child_arr_data = ArrayData::new_primitive::( 5, - Some(0), None, - 0, - vec![Buffer::from_slice_ref(&[1i32, 2, 3, 4, 5])], - vec![], + Buffer::from_slice_ref(&[1i32, 2, 3, 4, 5]), ); let v = vec![0, 1, 2, 3]; let b1 = Buffer::from(&v[..]); diff --git a/arrow/src/array/transform/mod.rs b/arrow/src/array/transform/mod.rs index 69092c1af55d..7b658db51948 100644 --- a/arrow/src/array/transform/mod.rs +++ b/arrow/src/array/transform/mod.rs @@ -1150,15 +1150,17 @@ mod tests { ]); let list_value_offsets = Buffer::from_slice_ref(&[0i32, 3, 5, 11, 13, 13, 15, 15, 17]); - let expected_list_data = ArrayData::new( - DataType::List(Box::new(Field::new("item", DataType::Int64, true))), - 8, - None, - None, - 0, - vec![list_value_offsets], - vec![expected_int_array.data().clone()], - ); + let expected_list_data = unsafe { + ArrayData::new_unchecked( + DataType::List(Box::new(Field::new("item", DataType::Int64, true))), + 8, + None, + None, + 0, + vec![list_value_offsets], + vec![expected_int_array.data().clone()], + ) + }; assert_eq!(finished, expected_list_data); Ok(()) @@ -1231,15 +1233,17 @@ mod tests { ]); let list_value_offsets = Buffer::from_slice_ref(&[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23]); - let expected_list_data = ArrayData::new( - DataType::List(Box::new(Field::new("item", DataType::Int64, 
true))), - 12, - None, - Some(Buffer::from(&[0b11011011, 0b1110])), - 0, - vec![list_value_offsets], - vec![expected_int_array.data().clone()], - ); + let expected_list_data = unsafe { + ArrayData::new_unchecked( + DataType::List(Box::new(Field::new("item", DataType::Int64, true))), + 12, + None, + Some(Buffer::from(&[0b11011011, 0b1110])), + 0, + vec![list_value_offsets], + vec![expected_int_array.data().clone()], + ) + }; assert_eq!(result, expected_list_data); Ok(()) @@ -1302,15 +1306,17 @@ mod tests { // extend b[0..0] ]); let list_value_offsets = Buffer::from_slice_ref(&[0, 3, 5, 6, 9, 10, 13]); - let expected_list_data = ArrayData::new( - DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), - 6, - None, - None, - 0, - vec![list_value_offsets], - vec![expected_string_array.data().clone()], - ); + let expected_list_data = unsafe { + ArrayData::new_unchecked( + DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + 6, + None, + None, + 0, + vec![list_value_offsets], + vec![expected_string_array.data().clone()], + ) + }; assert_eq!(result, expected_list_data); Ok(()) } diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index b9596ee8cbd6..4a5476d8b7f7 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -182,14 +182,10 @@ where // `values` is an iterator with a known size. 
let buffer = unsafe { Buffer::from_trusted_len_iter(values) }; - let data = ArrayData::new( - T::DATA_TYPE, + let data = ArrayData::new_primitive::( left.len(), - None, null_bit_buffer, - 0, - vec![buffer], - vec![], + buffer, ); Ok(PrimitiveArray::::from(data)) } @@ -250,14 +246,10 @@ where unsafe { Buffer::try_from_trusted_len_iter(values) } }?; - let data = ArrayData::new( - T::DATA_TYPE, + let data = ArrayData::new_primitive::( left.len(), - None, null_bit_buffer, - 0, - vec![buffer], - vec![], + buffer, ); Ok(PrimitiveArray::::from(data)) } @@ -318,14 +310,10 @@ where unsafe { Buffer::try_from_trusted_len_iter(values) } }?; - let data = ArrayData::new( - T::DATA_TYPE, + let data = ArrayData::new_primitive::( left.len(), - None, null_bit_buffer, - 0, - vec![buffer], - vec![], + buffer, ); Ok(PrimitiveArray::::from(data)) } diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index d7beae605993..28b2f8e12f00 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -26,17 +26,13 @@ fn into_primitive_array_data( array: &PrimitiveArray, buffer: Buffer, ) -> ArrayData { - ArrayData::new( - O::DATA_TYPE, + ArrayData::new_primitive::( array.len(), - None, array .data_ref() .null_buffer() .map(|b| b.bit_slice(array.offset(), array.len())), - 0, - vec![buffer], - vec![], + buffer, ) } diff --git a/arrow/src/compute/kernels/boolean.rs b/arrow/src/compute/kernels/boolean.rs index fcd1fb014b32..9df7df931df2 100644 --- a/arrow/src/compute/kernels/boolean.rs +++ b/arrow/src/compute/kernels/boolean.rs @@ -29,7 +29,7 @@ use crate::buffer::{ buffer_bin_and, buffer_bin_or, buffer_unary_not, Buffer, MutableBuffer, }; use crate::compute::util::combine_option_bitmap; -use crate::datatypes::{ArrowNumericType, DataType}; +use crate::datatypes::ArrowNumericType; use crate::error::{ArrowError, Result}; use crate::util::bit_util::{ceil, round_upto_multiple_of_64}; use core::iter; @@ -159,14 +159,10 @@ where let 
bool_buffer: Buffer = value_buffer.into(); let bool_valid_buffer: Buffer = valid_buffer.into(); - let array_data = ArrayData::new( - DataType::Boolean, + let array_data = ArrayData::new_boolean( len, - None, Some(bool_valid_buffer), - left_offset, - vec![bool_buffer], - vec![], + bool_buffer, ); Ok(BooleanArray::from(array_data)) @@ -200,14 +196,10 @@ where let values = op(left_buffer, left_offset, right_buffer, right_offset, len); - let data = ArrayData::new( - DataType::Boolean, + let data = ArrayData::new_boolean( len, - None, null_bit_buffer, - 0, - vec![values], - vec![], + values, ); Ok(BooleanArray::from(data)) } @@ -380,14 +372,10 @@ pub fn not(left: &BooleanArray) -> Result { let values = buffer_unary_not(&data.buffers()[0], left_offset, len); - let data = ArrayData::new( - DataType::Boolean, + let data = ArrayData::new_boolean( len, - None, null_bit_buffer, - 0, - vec![values], - vec![], + values, ); Ok(BooleanArray::from(data)) } @@ -419,7 +407,7 @@ pub fn is_null(input: &dyn Array) -> Result { }; let data = - ArrayData::new(DataType::Boolean, len, None, None, 0, vec![output], vec![]); + ArrayData::new_boolean(len, None, output); Ok(BooleanArray::from(data)) } @@ -452,8 +440,7 @@ pub fn is_not_null(input: &dyn Array) -> Result { Some(buffer) => buffer.bit_slice(input.offset(), len), }; - let data = - ArrayData::new(DataType::Boolean, len, None, None, 0, vec![output], vec![]); + let data = ArrayData::new_boolean(len, None, output); Ok(BooleanArray::from(data)) } @@ -524,27 +511,20 @@ where // Align/shift left data on offset as needed, since new bitmaps are shifted and aligned to 0 already // NOTE: this probably only works for primitive arrays. - let data_buffers = if left.offset() == 0 { - left_data.buffers().to_vec() + let data_buffer = if left.offset() == 0 { + left_data.buffers()[0].clone() } else { // Shift each data buffer by type's bit_width * offset. 
left_data - .buffers() - .iter() - .map(|buf| buf.slice(left.offset() * T::get_byte_width())) - .collect::>() + .buffers()[0].slice(left.offset() * T::get_byte_width()) }; // Construct new array with same values but modified null bitmap // TODO: shift data buffer as needed - let data = ArrayData::new( - T::DATA_TYPE, + let data = ArrayData::new_primitive::( left.len(), - None, // force new to compute the number of null bits modified_null_buffer, - 0, // No need for offset since left data has been shifted - data_buffers, - left_data.child_data().to_vec(), + data_buffer ); Ok(PrimitiveArray::::from(data)) } diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 593adecc381c..0ea3307ca333 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -691,47 +691,59 @@ pub fn cast_with_options( // end numeric casts // temporal casts - (Int32, Date32) => cast_array_data::(array, to_type.clone()), + (Int32, Date32) => unsafe { + cast_array_data::(array, to_type.clone()) + }, (Int32, Date64) => cast_with_options( &cast_with_options(array, &DataType::Date32, cast_options)?, &DataType::Date64, cast_options, ), - (Int32, Time32(TimeUnit::Second)) => { + (Int32, Time32(TimeUnit::Second)) => unsafe { cast_array_data::(array, to_type.clone()) } - (Int32, Time32(TimeUnit::Millisecond)) => { + (Int32, Time32(TimeUnit::Millisecond)) => unsafe { cast_array_data::(array, to_type.clone()) } // No support for microsecond/nanosecond with i32 - (Date32, Int32) => cast_array_data::(array, to_type.clone()), + (Date32, Int32) => unsafe { + cast_array_data::(array, to_type.clone()) + }, (Date32, Int64) => cast_with_options( &cast_with_options(array, &DataType::Int32, cast_options)?, &DataType::Int64, cast_options, ), - (Time32(_), Int32) => cast_array_data::(array, to_type.clone()), - (Int64, Date64) => cast_array_data::(array, to_type.clone()), + (Time32(_), Int32) => unsafe { + cast_array_data::(array, to_type.clone()) + }, + 
(Int64, Date64) => unsafe { + cast_array_data::(array, to_type.clone()) + }, (Int64, Date32) => cast_with_options( &cast_with_options(array, &DataType::Int32, cast_options)?, &DataType::Date32, cast_options, ), // No support for second/milliseconds with i64 - (Int64, Time64(TimeUnit::Microsecond)) => { + (Int64, Time64(TimeUnit::Microsecond)) => unsafe { cast_array_data::(array, to_type.clone()) } - (Int64, Time64(TimeUnit::Nanosecond)) => { + (Int64, Time64(TimeUnit::Nanosecond)) => unsafe { cast_array_data::(array, to_type.clone()) } - (Date64, Int64) => cast_array_data::(array, to_type.clone()), + (Date64, Int64) => unsafe { + cast_array_data::(array, to_type.clone()) + }, (Date64, Int32) => cast_with_options( &cast_with_options(array, &DataType::Int64, cast_options)?, &DataType::Int32, cast_options, ), - (Time64(_), Int64) => cast_array_data::(array, to_type.clone()), + (Time64(_), Int64) => unsafe { + cast_array_data::(array, to_type.clone()) + }, (Date32, Date64) => { let date_array = array.as_any().downcast_ref::().unwrap(); @@ -783,14 +795,18 @@ pub fn cast_with_options( let array_ref = Arc::new(converted) as ArrayRef; use TimeUnit::*; match to_unit { - Microsecond => cast_array_data::( - &array_ref, - to_type.clone(), - ), - Nanosecond => cast_array_data::( - &array_ref, - to_type.clone(), - ), + Microsecond => unsafe { + cast_array_data::( + &array_ref, + to_type.clone(), + ) + }, + Nanosecond => unsafe { + cast_array_data::( + &array_ref, + to_type.clone(), + ) + }, _ => unreachable!("array type not supported"), } } @@ -835,8 +851,10 @@ pub fn cast_with_options( _ => unreachable!("array type not supported"), } } - (Timestamp(_, _), Int64) => cast_array_data::(array, to_type.clone()), - (Int64, Timestamp(to_unit, _)) => { + (Timestamp(_, _), Int64) => unsafe { + cast_array_data::(array, to_type.clone()) + } + (Int64, Timestamp(to_unit, _)) => unsafe { use TimeUnit::*; match to_unit { Second => cast_array_data::(array, to_type.clone()), @@ -871,21 +889,27 
@@ pub fn cast_with_options( let array_ref = Arc::new(converted) as ArrayRef; use TimeUnit::*; match to_unit { - Second => { + Second => unsafe { cast_array_data::(&array_ref, to_type.clone()) } - Millisecond => cast_array_data::( - &array_ref, - to_type.clone(), - ), - Microsecond => cast_array_data::( - &array_ref, - to_type.clone(), - ), - Nanosecond => cast_array_data::( - &array_ref, - to_type.clone(), - ), + Millisecond => unsafe { + cast_array_data::( + &array_ref, + to_type.clone(), + ) + }, + Microsecond => unsafe { + cast_array_data::( + &array_ref, + to_type.clone(), + ) + }, + Nanosecond => unsafe { + cast_array_data::( + &array_ref, + to_type.clone(), + ) + }, } } (Timestamp(from_unit, _), Date32) => { @@ -918,7 +942,7 @@ pub fn cast_with_options( &Date64Array::from(vec![from_size / to_size; array.len()]), )?) as ArrayRef) } - std::cmp::Ordering::Equal => { + std::cmp::Ordering::Equal => unsafe { cast_array_data::(array, to_type.clone()) } std::cmp::Ordering::Greater => { @@ -931,7 +955,7 @@ pub fn cast_with_options( } } // date64 to timestamp might not make sense, - (Int64, Duration(to_unit)) => { + (Int64, Duration(to_unit)) => unsafe { use TimeUnit::*; match to_unit { Second => cast_array_data::(array, to_type.clone()), @@ -985,11 +1009,12 @@ const EPOCH_DAYS_FROM_CE: i32 = 719_163; /// Arrays should have the same primitive data type, otherwise this should fail. /// We do not perform this check on primitive data types as we only use this /// function internally, where it is guaranteed to be infallible. 
-fn cast_array_data(array: &ArrayRef, to_type: DataType) -> Result +// Safety: from and to data types must have the same layout +unsafe fn cast_array_data(array: &ArrayRef, to_type: DataType) -> Result where TO: ArrowNumericType, { - let data = ArrayData::new( + let data = ArrayData::new_unchecked( to_type, array.len(), Some(array.null_count()), @@ -1432,19 +1457,21 @@ fn dictionary_cast( } // keys are data, child_data is values (dictionary) - let data = ArrayData::new( - to_type.clone(), - cast_keys.len(), - Some(cast_keys.null_count()), - cast_keys - .data() - .null_bitmap() - .clone() - .map(|bitmap| bitmap.bits), - cast_keys.data().offset(), - cast_keys.data().buffers().to_vec(), - vec![cast_values.data().clone()], - ); + let data = unsafe { + ArrayData::new_unchecked( + to_type.clone(), + cast_keys.len(), + Some(cast_keys.null_count()), + cast_keys + .data() + .null_bitmap() + .clone() + .map(|bitmap| bitmap.bits), + cast_keys.data().offset(), + cast_keys.data().buffers().to_vec(), + vec![cast_values.data().clone()], + ) + }; // create the appropriate array type let new_array: ArrayRef = match **to_index_type { @@ -1648,19 +1675,21 @@ fn cast_primitive_to_list( ) }; - let list_data = ArrayData::new( - to_type.clone(), - array.len(), - Some(cast_array.null_count()), - cast_array - .data() - .null_bitmap() - .clone() - .map(|bitmap| bitmap.bits), - 0, - vec![offsets.into()], - vec![cast_array.data().clone()], - ); + let list_data = unsafe { + ArrayData::new_unchecked( + to_type.clone(), + array.len(), + Some(cast_array.null_count()), + cast_array + .data() + .null_bitmap() + .clone() + .map(|bitmap| bitmap.bits), + 0, + vec![offsets.into()], + vec![cast_array.data().clone()], + ) + }; let list_array = Arc::new(GenericListArray::::from(list_data)) as ArrayRef; @@ -1677,20 +1706,22 @@ fn cast_list_inner( let data = array.data_ref(); let underlying_array = make_array(data.child_data()[0].clone()); let cast_array = cast_with_options(&underlying_array, 
to.data_type(), cast_options)?; - let array_data = ArrayData::new( - to_type.clone(), - array.len(), - Some(cast_array.null_count()), - cast_array - .data() - .null_bitmap() - .clone() - .map(|bitmap| bitmap.bits), - array.offset(), - // reuse offset buffer - data.buffers().to_vec(), - vec![cast_array.data().clone()], - ); + let array_data = unsafe { + ArrayData::new_unchecked( + to_type.clone(), + array.len(), + Some(cast_array.null_count()), + cast_array + .data() + .null_bitmap() + .clone() + .map(|bitmap| bitmap.bits), + array.offset(), + // reuse offset buffer + data.buffers().to_vec(), + vec![cast_array.data().clone()], + ) + }; let list = GenericListArray::::from(array_data); Ok(Arc::new(list) as ArrayRef) } diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 49d0aca55093..c77c4b5c0705 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -53,14 +53,10 @@ macro_rules! compare_op { // same size as $left.len() and $right.len() let buffer = unsafe { MutableBuffer::from_trusted_len_iter_bool(comparison) }; - let data = ArrayData::new( - DataType::Boolean, + let data = ArrayData::new_boolean( $left.len(), - None, null_bit_buffer, - 0, - vec![Buffer::from(buffer)], - vec![], + Buffer::from(buffer), ); Ok(BooleanArray::from(data)) }}; @@ -108,14 +104,10 @@ macro_rules! compare_op_primitive { *last |= if $op(lhs, rhs) { 1 << i } else { 0 }; }); }; - let data = ArrayData::new( - DataType::Boolean, + let data = ArrayData::new_boolean( $left.len(), - None, null_bit_buffer, - 0, - vec![Buffer::from(values)], - vec![], + Buffer::from(values), ); Ok(BooleanArray::from(data)) }}; @@ -135,14 +127,10 @@ macro_rules! 
compare_op_scalar { // same as $left.len() let buffer = unsafe { MutableBuffer::from_trusted_len_iter_bool(comparison) }; - let data = ArrayData::new( - DataType::Boolean, + let data = ArrayData::new_boolean( $left.len(), - None, null_bit_buffer, - 0, - vec![Buffer::from(buffer)], - vec![], + Buffer::from(buffer), ); Ok(BooleanArray::from(data)) }}; @@ -175,14 +163,10 @@ macro_rules! compare_op_scalar_primitive { }); }; - let data = ArrayData::new( - DataType::Boolean, + let data = ArrayData::new_boolean( $left.len(), - None, null_bit_buffer, - 0, - vec![Buffer::from(values)], - vec![], + Buffer::from(values), ); Ok(BooleanArray::from(data)) }}; @@ -270,14 +254,10 @@ pub fn like_utf8( result.append(re.is_match(haystack)); } - let data = ArrayData::new( - DataType::Boolean, + let data = ArrayData::new_boolean( left.len(), - None, null_bit_buffer, - 0, - vec![result.finish()], - vec![], + result.finish(), ); Ok(BooleanArray::from(data)) } @@ -340,14 +320,10 @@ pub fn like_utf8_scalar( } }; - let data = ArrayData::new( - DataType::Boolean, + let data = ArrayData::new_boolean( left.len(), - None, null_bit_buffer, - 0, - vec![bool_buf.into()], - vec![], + bool_buf.into(), ); Ok(BooleanArray::from(data)) } @@ -392,14 +368,10 @@ pub fn nlike_utf8( result.append(!re.is_match(haystack)); } - let data = ArrayData::new( - DataType::Boolean, + let data = ArrayData::new_boolean( left.len(), - None, null_bit_buffer, - 0, - vec![result.finish()], - vec![], + result.finish(), ); Ok(BooleanArray::from(data)) } @@ -445,14 +417,10 @@ pub fn nlike_utf8_scalar( } } - let data = ArrayData::new( - DataType::Boolean, + let data = ArrayData::new_boolean( left.len(), - None, null_bit_buffer, - 0, - vec![result.finish()], - vec![], + result.finish(), ); Ok(BooleanArray::from(data)) } @@ -530,14 +498,10 @@ pub fn regexp_is_match_utf8( }) .collect::>>()?; - let data = ArrayData::new( - DataType::Boolean, + let data = ArrayData::new_boolean( array.len(), - None, null_bit_buffer, - 0, - 
vec![result.finish()], - vec![], + result.finish(), ); Ok(BooleanArray::from(data)) } @@ -575,14 +539,10 @@ pub fn regexp_is_match_utf8_scalar( } } - let data = ArrayData::new( - DataType::Boolean, + let data = ArrayData::new_boolean( array.len(), - None, null_bit_buffer, - 0, - vec![result.finish()], - vec![], + result.finish(), ); Ok(BooleanArray::from(data)) } @@ -1046,14 +1006,10 @@ where } } - let data = ArrayData::new( - DataType::Boolean, + let data = ArrayData::new_boolean( left.len(), None, - None, - 0, - vec![bool_buf.into()], - vec![], + bool_buf.into(), ); Ok(BooleanArray::from(data)) } @@ -1104,14 +1060,10 @@ where } } - let data = ArrayData::new( - DataType::Boolean, + let data = ArrayData::new_boolean( left.len(), None, - None, - 0, - vec![bool_buf.into()], - vec![], + bool_buf.into(), ); Ok(BooleanArray::from(data)) } diff --git a/arrow/src/compute/kernels/length.rs b/arrow/src/compute/kernels/length.rs index fb76d000076e..806fa63b01d5 100644 --- a/arrow/src/compute/kernels/length.rs +++ b/arrow/src/compute/kernels/length.rs @@ -27,13 +27,15 @@ use crate::{ error::{ArrowError, Result}, }; -fn unary_offsets_string( +fn unary_offsets_string( array: &GenericStringArray, - data_type: DataType, + _data_type: DataType, op: F, ) -> ArrayRef where + T: ArrowPrimitiveType, O: StringOffsetSizeTrait + ArrowNativeType, + T::Native: StringOffsetSizeTrait, F: Fn(O) -> O, { // note: offsets are stored as u8, but they can be interpreted as OffsetSize @@ -56,14 +58,10 @@ where .null_buffer() .map(|b| b.bit_slice(array.offset(), array.len())); - let data = ArrayData::new( - data_type, + let data = ArrayData::new_primitive::( array.len(), - None, null_bit_buffer, - 0, - vec![buffer], - vec![], + buffer, ); make_array(data) } @@ -78,7 +76,7 @@ where .as_any() .downcast_ref::>() .unwrap(); - unary_offsets_string::(array, T::DATA_TYPE, |x| x) + unary_offsets_string::(array, T::DATA_TYPE, |x| x) } fn bit_length_impl( @@ -92,7 +90,7 @@ where .downcast_ref::>() .unwrap(); 
let bits_in_bytes = O::from_usize(8).unwrap(); - unary_offsets_string::(array, T::DATA_TYPE, |x| x * bits_in_bytes) + unary_offsets_string::(array, T::DATA_TYPE, |x| x * bits_in_bytes) } /// Returns an array of Int32/Int64 denoting the number of bytes in each string in the array. diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 6f42be34aa44..d93fc908b1c3 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -488,14 +488,10 @@ fn sort_boolean( } } - let result_data = ArrayData::new( - DataType::UInt32, + let result_data = ArrayData::new_primitive::( len, - Some(0), None, - 0, - vec![result.into()], - vec![], + result.into(), ); UInt32Array::from(result_data) @@ -574,14 +570,10 @@ where } } - let result_data = ArrayData::new( - DataType::UInt32, + let result_data = ArrayData::new_primitive::( len, - Some(0), None, - 0, - vec![result.into()], - vec![], + result.into(), ); UInt32Array::from(result_data) diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs index d4ea6616c648..01fdf640bdae 100644 --- a/arrow/src/compute/kernels/substring.rs +++ b/arrow/src/compute/kernels/substring.rs @@ -74,18 +74,20 @@ fn generic_substring( new_values.extend_from_slice(&data[start..start + length]); }); - let data = ArrayData::new( - ::DATA_TYPE, - array.len(), - None, - null_bit_buffer, - 0, - vec![ - Buffer::from_slice_ref(&new_offsets), - Buffer::from_slice_ref(&new_values), - ], - vec![], - ); + let data = unsafe { + ArrayData::new_unchecked( + ::DATA_TYPE, + array.len(), + None, + null_bit_buffer, + 0, + vec![ + Buffer::from_slice_ref(&new_offsets), + Buffer::from_slice_ref(&new_values), + ], + vec![], + ) + }; Ok(make_array(data)) } diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index 71479723e022..b167ab8bc203 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -523,14 +523,10 @@ where } }; - 
let data = ArrayData::new( - T::DATA_TYPE, + let data = ArrayData::new_primitive::( indices.len(), - None, nulls, - 0, - vec![buffer], - vec![], + buffer, ); Ok(PrimitiveArray::::from(data)) } @@ -598,14 +594,10 @@ where }; } - let data = ArrayData::new( - DataType::Boolean, + let data = ArrayData::new_boolean( indices.len(), - None, nulls, - 0, - vec![val_buf.into()], - vec![], + val_buf.into(), ); Ok(BooleanArray::from(data)) } @@ -884,15 +876,17 @@ where let new_keys = take_primitive::(values.keys(), indices)?; let new_keys_data = new_keys.data_ref(); - let data = ArrayData::new( - values.data_type().clone(), - new_keys.len(), - Some(new_keys_data.null_count()), - new_keys_data.null_buffer().cloned(), - 0, - new_keys_data.buffers().to_vec(), - values.data().child_data().to_vec(), - ); + let data = unsafe { + ArrayData::new_unchecked( + values.data_type().clone(), + new_keys.len(), + Some(new_keys_data.null_count()), + new_keys_data.null_buffer().cloned(), + 0, + new_keys_data.buffers().to_vec(), + values.data().child_data().to_vec(), + ) + }; Ok(DictionaryArray::::from(data)) } diff --git a/arrow/src/compute/util.rs b/arrow/src/compute/util.rs index 6d4d0e40a9b4..eb3baeaab3b9 100644 --- a/arrow/src/compute/util.rs +++ b/arrow/src/compute/util.rs @@ -184,16 +184,20 @@ pub(super) mod tests { offset: usize, null_bit_buffer: Option, ) -> Arc { - // empty vec for buffers and children is not really correct, but for these tests we only care about the null bitmap - Arc::new(ArrayData::new( - DataType::UInt8, - len, - None, - null_bit_buffer, - offset, - vec![], - vec![], - )) + if let Some(ref b) = null_bit_buffer { + assert!(b.len()*8 >= offset+len); + } + unsafe { + Arc::new(ArrayData::new_unchecked( + DataType::UInt8, + len, + None, + null_bit_buffer, + offset, + vec![Buffer::from(vec![0; offset + len])], + vec![], + )) + } } #[test] diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs index 1cbec341cf37..7a0c3a9230bf 100644 --- 
a/arrow/src/datatypes/datatype.rs +++ b/arrow/src/datatypes/datatype.rs @@ -477,6 +477,12 @@ impl DataType { ) } + /// Returns true if this type is primitive: numeric (see `is_numeric`) or a date/time type (Time32, Time64, Date32, Date64, Timestamp, Duration, or Interval). + pub fn is_primitive(t: &DataType) -> bool { + use DataType::*; + DataType::is_numeric(t) || matches!(t, Time32(_) | Time64(_) | Date32 | Date64 | Timestamp(_, _) | Duration(_) | Interval(_)) + } + /// Compares the datatype with another, ignoring nested field names /// and metadata. pub(crate) fn equals_datatype(&self, other: &DataType) -> bool { diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 36d7f26f3f20..c6b3ad4ba2c5 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -514,15 +514,17 @@ pub trait ArrowArrayRef { .map(|d| d.unwrap()) .collect(); - Ok(ArrayData::new( - data_type, - len, - Some(null_count), - null_bit_buffer, - offset, - buffers, - child_data, - )) + unsafe { + Ok(ArrayData::new_unchecked( + data_type, + len, + Some(null_count), + null_bit_buffer, + offset, + buffers, + child_data, + )) + } } /// returns all buffers, as organized by Rust (i.e. 
null buffer is skipped) diff --git a/arrow/src/json/reader.rs b/arrow/src/json/reader.rs index 9592b59b2732..9814a3fb2f14 100644 --- a/arrow/src/json/reader.rs +++ b/arrow/src/json/reader.rs @@ -1388,26 +1388,28 @@ impl Decoder { &[], )?; - Ok(make_array(ArrayData::new( - map_type.clone(), - rows_len, - None, - Some(list_bitmap.into()), - 0, - vec![Buffer::from_slice_ref(&list_offsets)], - vec![ArrayData::new( - struct_field.data_type().clone(), - struct_children[0].len(), - None, + unsafe { + Ok(make_array(ArrayData::new_unchecked( + map_type.clone(), + rows_len, None, + Some(list_bitmap.into()), 0, - vec![], - struct_children - .into_iter() - .map(|array| array.data().clone()) - .collect(), - )], - ))) + vec![Buffer::from_slice_ref(&list_offsets)], + vec![ArrayData::new_unchecked( + struct_field.data_type().clone(), + struct_children[0].len(), + None, + None, + 0, + vec![], + struct_children + .into_iter() + .map(|array| array.data().clone()) + .collect(), + )], + ))) + } } #[inline(always)] diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 08624ae86ee8..35b65ef303db 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -192,15 +192,17 @@ fn create_random_list_array( true => Some(create_random_null_buffer(size, null_density)), false => None, }; - let list_data = ArrayData::new( - field.data_type().clone(), - size, - None, - null_buffer, - 0, - vec![offsets], - vec![child_data.clone()], - ); + let list_data = unsafe { + ArrayData::new_unchecked( + field.data_type().clone(), + size, + None, + null_buffer, + 0, + vec![offsets], + vec![child_data.clone()], + ) + }; Ok(make_array(list_data)) }