Skip to content

Commit

Permalink
fix: remove offsetbuffer
Browse files Browse the repository at this point in the history
  • Loading branch information
Kikkon committed Apr 6, 2024
1 parent d8aba1b commit 955eb2b
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 164 deletions.
111 changes: 70 additions & 41 deletions arrow-array/src/array/list_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@
// specific language governing permissions and limitations
// under the License.

use crate::array::{get_offsets, get_sizes, make_array, print_long_array};
use crate::array::{get_view_offsets, get_view_sizes, make_array, print_long_array};
use crate::builder::{GenericListViewBuilder, PrimitiveBuilder};
use crate::iterator::GenericListViewArrayIter;
use crate::{
new_empty_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, FixedSizeListArray,
OffsetSizeTrait,
};
use arrow_buffer::{NullBuffer, OffsetBuffer, SizeBuffer};
use arrow_buffer::{NullBuffer, ScalarBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType, FieldRef};
use std::any::Any;
Expand All @@ -48,8 +48,8 @@ pub struct GenericListViewArray<OffsetSize: OffsetSizeTrait> {
data_type: DataType,
nulls: Option<NullBuffer>,
values: ArrayRef,
value_offsets: OffsetBuffer<OffsetSize>,
value_sizes: SizeBuffer<OffsetSize>,
value_offsets: ScalarBuffer<OffsetSize>,
value_sizes: ScalarBuffer<OffsetSize>,
len: usize,
}

Expand All @@ -76,35 +76,47 @@ impl<OffsetSize: OffsetSizeTrait> GenericListViewArray<OffsetSize> {
DataType::ListView
};

/// Returns the data type of the list view array
/// Create a new [`GenericListViewArray`] from the provided parts
///
/// # Errors
///
/// Errors if
///
/// * `offsets.len() != sizes.len()`
/// * `offsets.len() != nulls.len()`
/// * `offsets.last() > values.len()`
/// * `!field.is_nullable() && values.is_nullable()`
/// * `field.data_type() != values.data_type()`
/// * `0 <= offsets[i] <= length of the child array`
/// * `0 <= offsets[i] + size[i] <= length of the child array`
pub fn try_new(
field: FieldRef,
offsets: OffsetBuffer<OffsetSize>,
sizes: SizeBuffer<OffsetSize>,
offsets: ScalarBuffer<OffsetSize>,
sizes: ScalarBuffer<OffsetSize>,
values: ArrayRef,
nulls: Option<NullBuffer>,
) -> Result<Self, ArrowError> {
let mut len = 0usize;
for i in 0..offsets.len() {
let length = values.len();
let offset = offsets[i].as_usize();
let size = sizes[i].as_usize();
if offset > length {
return Err(ArrowError::InvalidArgumentError(format!(
"Invalid offset value for {}ListViewArray, offset: {}, length: {}",
OffsetSize::PREFIX,
offset,
length
)));
}
if offset + size > values.len() {
return Err(ArrowError::InvalidArgumentError(format!(
"Invalid offset and size values for {}ListViewArray, offset: {}, size: {}, values.len(): {}",
OffsetSize::PREFIX, offset, size, values.len()
)));
}
len = len.add(size);
}

let len = offsets.len();
if len != sizes.len() {
return Err(ArrowError::InvalidArgumentError(format!(
"Length of offsets buffer and sizes buffer must be equal for {}ListViewArray, got {} and {}",
OffsetSize::PREFIX, len, sizes.len()
)));
}

if let Some(n) = nulls.as_ref() {
if n.len() != len {
return Err(ArrowError::InvalidArgumentError(format!(
Expand All @@ -114,6 +126,13 @@ impl<OffsetSize: OffsetSizeTrait> GenericListViewArray<OffsetSize> {
)));
}
}
if len != sizes.len() {
return Err(ArrowError::InvalidArgumentError(format!(
"Length of offsets buffer and sizes buffer must be equal for {}ListViewArray, got {} and {}",
OffsetSize::PREFIX, len, sizes.len()
)));
}

if !field.is_nullable() && values.is_nullable() {
return Err(ArrowError::InvalidArgumentError(format!(
"Non-nullable field of {}ListViewArray {:?} cannot contain nulls",
Expand Down Expand Up @@ -149,8 +168,8 @@ impl<OffsetSize: OffsetSizeTrait> GenericListViewArray<OffsetSize> {
/// Panics if [`Self::try_new`] returns an error
pub fn new(
field: FieldRef,
offsets: OffsetBuffer<OffsetSize>,
sizes: SizeBuffer<OffsetSize>,
offsets: ScalarBuffer<OffsetSize>,
sizes: ScalarBuffer<OffsetSize>,
values: ArrayRef,
nulls: Option<NullBuffer>,
) -> Self {
Expand All @@ -163,8 +182,8 @@ impl<OffsetSize: OffsetSizeTrait> GenericListViewArray<OffsetSize> {
Self {
data_type: Self::DATA_TYPE_CONSTRUCTOR(field),
nulls: Some(NullBuffer::new_null(len)),
value_offsets: OffsetBuffer::new_zeroed(len),
value_sizes: SizeBuffer::new_zeroed(len),
value_offsets: ScalarBuffer::new_zeroed(len),
value_sizes: ScalarBuffer::new_zeroed(len),
values,
len: 0usize,
}
Expand All @@ -175,8 +194,8 @@ impl<OffsetSize: OffsetSizeTrait> GenericListViewArray<OffsetSize> {
self,
) -> (
FieldRef,
OffsetBuffer<OffsetSize>,
SizeBuffer<OffsetSize>,
ScalarBuffer<OffsetSize>,
ScalarBuffer<OffsetSize>,
ArrayRef,
Option<NullBuffer>,
) {
Expand All @@ -195,10 +214,10 @@ impl<OffsetSize: OffsetSizeTrait> GenericListViewArray<OffsetSize> {

/// Returns a reference to the offsets of this list
///
/// Unlike [`Self::value_offsets`] this returns the [`OffsetBuffer`]
/// Unlike [`Self::value_offsets`] this returns the [`ScalarBuffer`]
/// allowing for zero-copy cloning
#[inline]
pub fn offsets(&self) -> &OffsetBuffer<OffsetSize> {
pub fn offsets(&self) -> &ScalarBuffer<OffsetSize> {
&self.value_offsets
}

Expand All @@ -213,7 +232,7 @@ impl<OffsetSize: OffsetSizeTrait> GenericListViewArray<OffsetSize> {
/// Unlike [`Self::value_sizes`] this returns the [`SizeBuffer`]
/// allowing for zero-copy cloning
#[inline]
pub fn sizes(&self) -> &SizeBuffer<OffsetSize> {
pub fn sizes(&self) -> &ScalarBuffer<OffsetSize> {
&self.value_sizes
}

Expand Down Expand Up @@ -364,7 +383,7 @@ impl<OffsetSize: OffsetSizeTrait> Array for GenericListViewArray<OffsetSize> {

fn get_buffer_memory_size(&self) -> usize {
let mut size = self.values.get_buffer_memory_size();
size += self.value_offsets.inner().inner().capacity();
size += self.value_offsets.inner().capacity();
if let Some(n) = self.nulls.as_ref() {
size += n.buffer().capacity();
}
Expand All @@ -373,7 +392,7 @@ impl<OffsetSize: OffsetSizeTrait> Array for GenericListViewArray<OffsetSize> {

fn get_array_memory_size(&self) -> usize {
let mut size = std::mem::size_of::<Self>() + self.values.get_array_memory_size();
size += self.value_offsets.inner().inner().capacity();
size += self.value_offsets.inner().capacity();
if let Some(n) = self.nulls.as_ref() {
size += n.buffer().capacity();
}
Expand All @@ -399,8 +418,8 @@ impl<OffsetSize: OffsetSizeTrait> From<GenericListViewArray<OffsetSize>> for Arr
.len(len)
.nulls(array.nulls)
.buffers(vec![
array.value_offsets.into_inner().into_inner(),
array.value_sizes.into_inner().into_inner(),
array.value_offsets.into_inner(),
array.value_sizes.into_inner(),
])
.child_data(vec![array.values.to_data()]);

Expand All @@ -421,9 +440,20 @@ impl<OffsetSize: OffsetSizeTrait> From<FixedSizeListArray> for GenericListViewAr
DataType::FixedSizeList(f, size) => (f, *size as usize),
_ => unreachable!(),
};

let offsets = OffsetBuffer::from_lengths(std::iter::repeat(size).take(value.len()));
let sizes = SizeBuffer::from_lengths(std::iter::repeat(size).take(value.len()));
let iter = std::iter::repeat(size).take(value.len());
let mut offsets = Vec::with_capacity(iter.size_hint().0);
offsets.push(OffsetSize::usize_as(0));
let mut acc = 0_usize;
let iter = std::iter::repeat(size).take(value.len());
let mut sizes = Vec::with_capacity(iter.size_hint().0);
for size in iter {
acc = acc.checked_add(size).expect("usize overflow");
offsets.push(OffsetSize::usize_as(acc));
sizes.push(OffsetSize::usize_as(size));
}
OffsetSize::from_usize(acc).expect("offset overflow");
let sizes = ScalarBuffer::from(sizes);
let offsets = ScalarBuffer::from(offsets);
Self {
data_type: Self::DATA_TYPE_CONSTRUCTOR(field.clone()),
nulls: value.nulls().cloned(),
Expand Down Expand Up @@ -472,8 +502,8 @@ impl<OffsetSize: OffsetSizeTrait> GenericListViewArray<OffsetSize> {
let values = make_array(values);
// SAFETY:
// ArrayData is valid, and verified type above
let value_offsets = unsafe { get_offsets(&data) };
let value_sizes = unsafe { get_sizes(&data) };
let value_offsets = get_view_offsets(&data);
let value_sizes = get_view_sizes(&data);

Ok(Self {
data_type: data.data_type().clone(),
Expand All @@ -500,9 +530,9 @@ mod tests {
fn create_from_buffers() -> ListViewArray {
// [[0, 1, 2], [3, 4, 5], [6, 7]]
let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7]);
let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0, 3, 6]));
let offsets = ScalarBuffer::from(vec![0, 3, 6]);
let field = Arc::new(Field::new("item", DataType::Int32, true));
let sizes = SizeBuffer::new(ScalarBuffer::from(vec![3, 3, 2]));
let sizes = ScalarBuffer::from(vec![3, 3, 2]);

ListViewArray::new(field, offsets, sizes, Arc::new(values), None)
}
Expand Down Expand Up @@ -1133,8 +1163,8 @@ mod tests {

#[test]
fn test_try_new() {
let offsets = OffsetBuffer::new(vec![0, 1, 4, 5].into());
let sizes = SizeBuffer::new(vec![1, 3, 1, 0].into());
let offsets = ScalarBuffer::from(vec![0, 1, 4, 5]);
let sizes = ScalarBuffer::from(vec![1, 3, 1, 0]);
let values = Int32Array::new(vec![1, 2, 3, 4, 5].into(), None);
let values = Arc::new(values) as ArrayRef;

Expand All @@ -1157,8 +1187,8 @@ mod tests {
);

let nulls = NullBuffer::new_null(4);
let offsets = OffsetBuffer::new(vec![0, 1, 2, 3, 4].into());
let sizes = SizeBuffer::new(vec![1, 1, 1, 1, 0].into());
let offsets = ScalarBuffer::from(vec![0, 1, 2, 3, 4]);
let sizes = ScalarBuffer::from(vec![1, 1, 1, 1, 0]);
let err = LargeListViewArray::try_new(
field,
offsets.clone(),
Expand Down Expand Up @@ -1217,7 +1247,6 @@ mod tests {
builder.values().append_slice(&[4, 5, 6]);
builder.append(true);
let list: ListViewArray = builder.finish().into();

let values: Vec<_> = list
.iter()
.map(|x| x.map(|x| x.as_primitive::<Int32Type>().values().to_vec()))
Expand Down
29 changes: 19 additions & 10 deletions arrow-array/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
mod binary_array;

use crate::types::*;
use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer, ScalarBuffer, SizeBuffer};
use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer, ScalarBuffer};
use arrow_data::ArrayData;
use arrow_schema::{DataType, IntervalUnit, TimeUnit};
use std::any::Any;
Expand Down Expand Up @@ -696,18 +696,27 @@ unsafe fn get_offsets<O: ArrowNativeType>(data: &ArrayData) -> OffsetBuffer<O> {
}
}

/// Helper function that gets size from an [`ArrayData`]
/// Helper function that gets list view offset from an [`ArrayData`]
///
/// # Safety
unsafe fn get_sizes<O: ArrowNativeType>(data: &ArrayData) -> SizeBuffer<O> {
///
/// ArrayData must contain a valid [`ScalarBuffer`] as its first buffer
fn get_view_offsets<O: ArrowNativeType>(data: &ArrayData) -> ScalarBuffer<O> {
match data.is_empty() && data.buffers()[0].is_empty() {
true => ScalarBuffer::new_empty(),
false => ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len() + 1),
}
}

/// Helper function that gets list view size from an [`ArrayData`]
///
/// # Safety
///
/// ArrayData must contain a valid [`ScalarBuffer`] as its second buffer
fn get_view_sizes<O: ArrowNativeType>(data: &ArrayData) -> ScalarBuffer<O> {
match data.is_empty() && data.buffers()[1].is_empty() {
true => SizeBuffer::new_empty(),
false => {
let buffer = ScalarBuffer::new(data.buffers()[1].clone(), data.offset(), data.len());
// Safety:
// ArrayData is valid
SizeBuffer::new(buffer)
}
true => ScalarBuffer::new_empty(),
false => ScalarBuffer::new(data.buffers()[1].clone(), data.offset(), data.len()),
}
}

Expand Down
10 changes: 5 additions & 5 deletions arrow-array/src/builder/generic_list_view_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

use crate::builder::ArrayBuilder;
use crate::{ArrayRef, GenericListViewArray, OffsetSizeTrait};
use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, OffsetBuffer, SizeBuffer};
use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer};
use arrow_schema::{Field, FieldRef};
use std::any::Any;
use std::sync::Arc;
Expand Down Expand Up @@ -185,10 +185,10 @@ where

let offsets = self.offsets_builder.finish();
// Safety: Safe by construction
let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) };
let offsets = ScalarBuffer::from(offsets);

let sizes = self.sizes_builder.finish();
let sizes = SizeBuffer::new(sizes.into());
let sizes = ScalarBuffer::from(sizes);

let field = match &self.field {
Some(f) => f.clone(),
Expand All @@ -204,10 +204,10 @@ where

let offsets = Buffer::from_slice_ref(self.offsets_builder.as_slice());
// Safety: safe by construction
let offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) };
let offsets = ScalarBuffer::from(offsets);

let sizes = Buffer::from_slice_ref(self.sizes_builder.as_slice());
let sizes = SizeBuffer::new(sizes.into());
let sizes = ScalarBuffer::from(sizes);

let field = match &self.field {
Some(f) => f.clone(),
Expand Down
2 changes: 0 additions & 2 deletions arrow-buffer/src/buffer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,5 @@ pub use boolean::*;
mod null;
pub use null::*;
mod run;
mod size;
pub use size::*;

pub use run::*;
13 changes: 13 additions & 0 deletions arrow-buffer/src/buffer/scalar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,19 @@ impl<T: ArrowNativeType> ScalarBuffer<T> {
pub fn ptr_eq(&self, other: &Self) -> bool {
self.buffer.ptr_eq(&other.buffer)
}

/// Returns the length of this buffer in units of `T`
pub fn new_zeroed(len: usize) -> Self {
let len_bytes = len.checked_mul(std::mem::size_of::<T>()).expect("overflow");
let buffer = MutableBuffer::from_len_zeroed(len_bytes);
Self::from(buffer)
}

/// Create a new [`ScalarBuffer`] containing a single 0 value
pub fn new_empty() -> Self {
let buffer = MutableBuffer::from_len_zeroed(std::mem::size_of::<T>());
Self::from(buffer.into_buffer())
}
}

impl<T: ArrowNativeType> Deref for ScalarBuffer<T> {
Expand Down
Loading

0 comments on commit 955eb2b

Please sign in to comment.