From ca21bd7f06c88954e9c1d647c35413fec6121d22 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Thu, 23 Jan 2025 15:04:27 +0100 Subject: [PATCH] perf: Use BitmapBuilder in yet more places (#20868) --- crates/polars-arrow/src/bitmap/builder.rs | 5 +++++ .../polars-core/src/chunked_array/object/builder.rs | 10 +++++++--- .../src/dsl/function_expr/shift_and_fill.rs | 12 ++++++------ crates/polars-python/src/series/construction.rs | 6 +++--- crates/polars-row/src/decode.rs | 12 ++++++------ crates/polars-row/src/fixed/decimal.rs | 6 +++--- crates/polars-row/src/fixed/packed_u32.rs | 6 +++--- crates/polars-row/src/utils.rs | 8 ++++---- crates/polars-row/src/variable/no_order.rs | 6 +++--- crates/polars-row/src/variable/utf8.rs | 6 +++--- .../src/nodes/io_sources/parquet/row_group_decode.rs | 4 ++-- 11 files changed, 45 insertions(+), 36 deletions(-) diff --git a/crates/polars-arrow/src/bitmap/builder.rs b/crates/polars-arrow/src/bitmap/builder.rs index a4639e30adac..a8aa933fb1c3 100644 --- a/crates/polars-arrow/src/bitmap/builder.rs +++ b/crates/polars-arrow/src/bitmap/builder.rs @@ -24,6 +24,11 @@ impl BitmapBuilder { self.bit_len } + #[inline(always)] + pub fn is_empty(&self) -> bool { + self.bit_len == 0 + } + #[inline(always)] pub fn capacity(&self) -> usize { self.bit_cap diff --git a/crates/polars-core/src/chunked_array/object/builder.rs b/crates/polars-core/src/chunked_array/object/builder.rs index ca418a0ac92d..b6992aa591a2 100644 --- a/crates/polars-core/src/chunked_array/object/builder.rs +++ b/crates/polars-core/src/chunked_array/object/builder.rs @@ -145,13 +145,17 @@ where unsafe { ObjectChunked::new_with_dims(field, vec![arr], len, 0) } } - pub fn new_from_vec_and_validity(name: PlSmallStr, v: Vec, validity: Bitmap) -> Self { + pub fn new_from_vec_and_validity( + name: PlSmallStr, + v: Vec, + validity: Option, + ) -> Self { let field = Arc::new(Field::new(name, DataType::Object(T::type_name(), None))); let len = v.len(); - let null_count = validity.unset_bits(); + let null_count = validity.as_ref().map(|v| v.unset_bits()).unwrap_or(0); let arr = Box::new(ObjectArray { values: v.into(), - validity: Some(validity), + validity, }); unsafe { ObjectChunked::new_with_dims(field, vec![arr], len, null_count) } diff --git a/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs b/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs index d0d27742ed54..3230e96a98d1 100644 --- a/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs +++ b/crates/polars-plan/src/dsl/function_expr/shift_and_fill.rs @@ -17,24 +17,24 @@ where feature = "dtype-categorical" ))] fn shift_and_fill_with_mask(s: &Column, n: i64, fill_value: &Column) -> PolarsResult { - use polars_core::export::arrow::array::BooleanArray; - use polars_core::export::arrow::bitmap::MutableBitmap; + use arrow::array::BooleanArray; + use arrow::bitmap::BitmapBuilder; let mask: BooleanChunked = if n > 0 { let len = s.len(); - let mut bits = MutableBitmap::with_capacity(s.len()); + let mut bits = BitmapBuilder::with_capacity(s.len()); bits.extend_constant(n as usize, false); bits.extend_constant(len.saturating_sub(n as usize), true); - let mask = BooleanArray::from_data_default(bits.into(), None); + let mask = BooleanArray::from_data_default(bits.freeze(), None); mask.into() } else { let length = s.len() as i64; // n is negative, so subtraction. let tipping_point = std::cmp::max(length + n, 0); - let mut bits = MutableBitmap::with_capacity(s.len()); + let mut bits = BitmapBuilder::with_capacity(s.len()); bits.extend_constant(tipping_point as usize, true); bits.extend_constant(-n as usize, false); - let mask = BooleanArray::from_data_default(bits.into(), None); + let mask = BooleanArray::from_data_default(bits.freeze(), None); mask.into() }; s.shift(n).zip_with_same_type(&mask, fill_value) diff --git a/crates/polars-python/src/series/construction.rs b/crates/polars-python/src/series/construction.rs index a15773a21e3f..91cc012fec1f 100644 --- a/crates/polars-python/src/series/construction.rs +++ b/crates/polars-python/src/series/construction.rs @@ -3,7 +3,7 @@ use std::borrow::Cow; use numpy::{Element, PyArray1, PyArrayMethods}; use polars::export::arrow; use polars::export::arrow::array::Array; -use polars::export::arrow::bitmap::MutableBitmap; +use polars::export::arrow::bitmap::BitmapBuilder; use polars::export::arrow::types::NativeType; use polars_core::prelude::*; use polars_core::utils::CustomIterTools; @@ -294,7 +294,7 @@ impl PySeries { pub fn new_object(py: Python, name: &str, values: Vec, _strict: bool) -> Self { #[cfg(feature = "object")] { - let mut validity = MutableBitmap::with_capacity(values.len()); + let mut validity = BitmapBuilder::with_capacity(values.len()); values.iter().for_each(|v| { let is_valid = !v.inner.is_none(py); // SAFETY: we can ensure that validity has correct capacity. @@ -304,7 +304,7 @@ impl PySeries { let ca = ObjectChunked::::new_from_vec_and_validity( name.into(), values, - validity.into(), + validity.into_opt_validity(), ); let s = ca.into_series(); s.into() diff --git a/crates/polars-row/src/decode.rs b/crates/polars-row/src/decode.rs index 3e7cf79be06d..0a5e862798b5 100644 --- a/crates/polars-row/src/decode.rs +++ b/crates/polars-row/src/decode.rs @@ -1,4 +1,4 @@ -use arrow::bitmap::{Bitmap, MutableBitmap}; +use arrow::bitmap::{Bitmap, BitmapBuilder}; use arrow::buffer::Buffer; use arrow::datatypes::ArrowDataType; use arrow::offset::OffsetsBuffer; @@ -62,16 +62,16 @@ unsafe fn decode_validity(rows: &mut [&[u8]], opt: RowEncodingOptions) -> Option // No nulls just return None let first_null = first_null?; - let mut bm = MutableBitmap::new(); + let mut bm = BitmapBuilder::new(); bm.reserve(rows.len()); bm.extend_constant(first_null, true); bm.push(false); - bm.extend_from_trusted_len_iter(rows[first_null + 1..].iter_mut().map(|row| { + bm.extend_trusted_len_iter(rows[first_null + 1..].iter_mut().map(|row| { let v; (v, *row) = row.split_at_unchecked(1); v[0] != null_sentinel })); - Some(bm.freeze()) + bm.into_opt_validity() } // We inline this in an attempt to avoid the dispatch cost. @@ -240,7 +240,7 @@ unsafe fn decode( FixedSizeListArray::new(dtype.clone(), rows.len(), values, validity).to_boxed() }, D::List(list_field) | D::LargeList(list_field) => { - let mut validity = MutableBitmap::new(); + let mut validity = BitmapBuilder::new(); // @TODO: we could consider making this into a scratchpad let num_rows = rows.len(); @@ -281,7 +281,7 @@ unsafe fn decode( None } else { validity.extend_constant(num_rows - validity.len(), true); - Some(validity.freeze()) + validity.into_opt_validity() }; assert_eq!(offsets.len(), rows.len() + 1); diff --git a/crates/polars-row/src/fixed/decimal.rs b/crates/polars-row/src/fixed/decimal.rs index b77e91e8df7b..589b3e64ad05 100644 --- a/crates/polars-row/src/fixed/decimal.rs +++ b/crates/polars-row/src/fixed/decimal.rs @@ -6,7 +6,7 @@ use std::mem::MaybeUninit; use arrow::array::{Array, PrimitiveArray}; -use arrow::bitmap::MutableBitmap; +use arrow::bitmap::BitmapBuilder; use arrow::datatypes::ArrowDataType; use polars_utils::slice::Slice2Uninit; @@ -205,7 +205,7 @@ pub unsafe fn decode( return PrimitiveArray::new(ArrowDataType::Int128, values.into(), None); } - let mut validity = MutableBitmap::with_capacity(rows.len()); + let mut validity = BitmapBuilder::with_capacity(rows.len()); validity.extend_constant(values.len(), true); let start_len = values.len(); @@ -238,6 +238,6 @@ pub unsafe fn decode( PrimitiveArray::new( ArrowDataType::Int128, values.into(), - Some(validity.freeze()), + validity.into_opt_validity(), ) } diff --git a/crates/polars-row/src/fixed/packed_u32.rs b/crates/polars-row/src/fixed/packed_u32.rs index ac252d4376a3..41b6fb550e3e 100644 --- a/crates/polars-row/src/fixed/packed_u32.rs +++ b/crates/polars-row/src/fixed/packed_u32.rs @@ -6,7 +6,7 @@ use std::mem::MaybeUninit; use arrow::array::{Array, PrimitiveArray}; -use arrow::bitmap::MutableBitmap; +use arrow::bitmap::BitmapBuilder; use arrow::datatypes::ArrowDataType; use polars_utils::slice::Slice2Uninit; @@ -154,7 +154,7 @@ pub unsafe fn decode( return PrimitiveArray::new(ArrowDataType::UInt32, values.into(), None); } - let mut validity = MutableBitmap::with_capacity(rows.len()); + let mut validity = BitmapBuilder::with_capacity(rows.len()); validity.extend_constant(values.len(), true); let start_len = values.len(); @@ -175,6 +175,6 @@ pub unsafe fn decode( PrimitiveArray::new( ArrowDataType::UInt32, values.into(), - Some(validity.freeze()), + validity.into_opt_validity(), ) } diff --git a/crates/polars-row/src/utils.rs b/crates/polars-row/src/utils.rs index 5b5259b2847c..2f46fbff5105 100644 --- a/crates/polars-row/src/utils.rs +++ b/crates/polars-row/src/utils.rs @@ -1,4 +1,4 @@ -use arrow::bitmap::{Bitmap, MutableBitmap}; +use arrow::bitmap::{Bitmap, BitmapBuilder}; #[macro_export] macro_rules! with_match_arrow_primitive_type {( @@ -27,15 +27,15 @@ pub(crate) unsafe fn decode_opt_nulls(rows: &[&[u8]], null_sentinel: u8) -> Opti .iter() .position(|row| *row.get_unchecked(0) == null_sentinel)?; - let mut bm = MutableBitmap::with_capacity(rows.len()); + let mut bm = BitmapBuilder::with_capacity(rows.len()); bm.extend_constant(first_null, true); bm.push(false); - bm.extend_from_trusted_len_iter_unchecked( + bm.extend_trusted_len_iter( rows[first_null + 1..] .iter() .map(|row| *row.get_unchecked(0) != null_sentinel), ); - Some(bm.freeze()) + bm.into_opt_validity() } diff --git a/crates/polars-row/src/variable/no_order.rs b/crates/polars-row/src/variable/no_order.rs index 0619125af714..cba0c4f328ec 100644 --- a/crates/polars-row/src/variable/no_order.rs +++ b/crates/polars-row/src/variable/no_order.rs @@ -11,7 +11,7 @@ use std::mem::MaybeUninit; use arrow::array::{BinaryViewArray, MutableBinaryViewArray}; -use arrow::bitmap::MutableBitmap; +use arrow::bitmap::BitmapBuilder; use polars_utils::slice::Slice2Uninit; use crate::row::RowEncodingOptions; @@ -90,7 +90,7 @@ pub unsafe fn decode_variable_no_order( let num_rows = rows.len(); let mut array = MutableBinaryViewArray::<[u8]>::with_capacity(num_rows); - let mut validity = MutableBitmap::new(); + let mut validity = BitmapBuilder::new(); for row in rows.iter_mut() { let sentinel = *unsafe { row.get_unchecked(0) }; @@ -142,5 +142,5 @@ pub unsafe fn decode_variable_no_order( } let array = array.freeze(); - array.with_validity(Some(validity.freeze())) + array.with_validity(validity.into_opt_validity()) } diff --git a/crates/polars-row/src/variable/utf8.rs b/crates/polars-row/src/variable/utf8.rs index 6d0b473ad47a..68a4f3df5c2d 100644 --- a/crates/polars-row/src/variable/utf8.rs +++ b/crates/polars-row/src/variable/utf8.rs @@ -10,7 +10,7 @@ use std::mem::MaybeUninit; use arrow::array::{MutableBinaryViewArray, Utf8ViewArray}; -use arrow::bitmap::MutableBitmap; +use arrow::bitmap::BitmapBuilder; use crate::row::RowEncodingOptions; @@ -98,7 +98,7 @@ pub unsafe fn decode_str(rows: &mut [&[u8]], opt: RowEncodingOptions) -> Utf8Vie return array.into(); } - let mut validity = MutableBitmap::with_capacity(num_rows); + let mut validity = BitmapBuilder::with_capacity(num_rows); validity.extend_constant(array.len(), true); validity.push(false); array.push_value_ignore_validity(""); @@ -124,5 +124,5 @@ pub unsafe fn decode_str(rows: &mut [&[u8]], opt: RowEncodingOptions) -> Utf8Vie } let out: Utf8ViewArray = array.into(); - out.with_validity(Some(validity.freeze())) + out.with_validity(validity.into_opt_validity()) } diff --git a/crates/polars-stream/src/nodes/io_sources/parquet/row_group_decode.rs b/crates/polars-stream/src/nodes/io_sources/parquet/row_group_decode.rs index 8e4ddce836a1..978a2c5b2307 100644 --- a/crates/polars-stream/src/nodes/io_sources/parquet/row_group_decode.rs +++ b/crates/polars-stream/src/nodes/io_sources/parquet/row_group_decode.rs @@ -6,7 +6,7 @@ use polars_core::prelude::{ }; use polars_core::scalar::Scalar; use polars_core::series::{IsSorted, Series}; -use polars_core::utils::arrow::bitmap::{Bitmap, MutableBitmap}; +use polars_core::utils::arrow::bitmap::{Bitmap, BitmapBuilder}; use polars_error::{polars_bail, PolarsResult}; use polars_io::hive; use polars_io::predicates::PhysicalIoExpr; @@ -587,7 +587,7 @@ impl RowGroupDecoder { } let mask_bitmap = { - let mut mask_bitmap = MutableBitmap::with_capacity(mask.len()); + let mut mask_bitmap = BitmapBuilder::with_capacity(mask.len()); for chunk in mask.downcast_iter() { match chunk.validity() {