Skip to content

Commit

Permalink
perf: Halve the size of Booleans in row encoding (#19927)
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite authored Nov 23, 2024
1 parent 414d883 commit 5eeb369
Show file tree
Hide file tree
Showing 6 changed files with 133 additions and 70 deletions.
32 changes: 21 additions & 11 deletions crates/polars-arrow/src/bitmap/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -428,18 +428,15 @@ impl<P: AsRef<[bool]>> From<P> for MutableBitmap {
}
}

impl FromIterator<bool> for MutableBitmap {
fn from_iter<I>(iter: I) -> Self
where
I: IntoIterator<Item = bool>,
{
impl Extend<bool> for MutableBitmap {
fn extend<T: IntoIterator<Item = bool>>(&mut self, iter: T) {
let mut iterator = iter.into_iter();
let mut buffer = {
let byte_capacity: usize = iterator.size_hint().0.saturating_add(7) / 8;
Vec::with_capacity(byte_capacity)
};

let mut length = 0;
let mut buffer = std::mem::take(&mut self.buffer);
let mut length = std::mem::take(&mut self.length);

let byte_capacity: usize = iterator.size_hint().0.saturating_add(7) / 8;
buffer.reserve(byte_capacity);

loop {
let mut exhausted = false;
Expand Down Expand Up @@ -481,7 +478,20 @@ impl FromIterator<bool> for MutableBitmap {
break;
}
}
Self { buffer, length }

self.buffer = buffer;
self.length = length;
}
}

/// Builds a [`MutableBitmap`] from an iterator of booleans by starting from an
/// empty bitmap and delegating to its `Extend<bool>` implementation.
impl FromIterator<bool> for MutableBitmap {
    fn from_iter<I>(iter: I) -> Self
    where
        I: IntoIterator<Item = bool>,
    {
        let mut bitmap = Self::new();
        bitmap.extend(iter);
        bitmap
    }
}

Expand Down
4 changes: 2 additions & 2 deletions crates/polars-row/src/encode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,7 @@ unsafe fn encode_flat_array(
},
D::Boolean => {
let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
crate::fixed::encode_iter(buffer, array.iter(), field, offsets);
crate::fixed::encode_bool_iter(buffer, array.iter(), field, offsets);
},
dt if dt.is_numeric() => with_match_arrow_primitive_type!(dt, |$T| {
let array = array.as_any().downcast_ref::<PrimitiveArray<$T>>().unwrap();
Expand Down Expand Up @@ -815,7 +815,7 @@ pub fn fixed_size(dtype: &ArrowDataType) -> Option<usize> {
Decimal(_, _) => i128::ENCODED_LEN,
Float32 => f32::ENCODED_LEN,
Float64 => f64::ENCODED_LEN,
Boolean => bool::ENCODED_LEN,
Boolean => 1,
FixedSizeList(f, width) => 1 + width * fixed_size(f.dtype())?,
Struct(fs) => {
let mut sum = 0;
Expand Down
101 changes: 60 additions & 41 deletions crates/polars-row/src/fixed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::fmt::Debug;
use std::mem::MaybeUninit;

use arrow::array::{BooleanArray, PrimitiveArray};
use arrow::bitmap::Bitmap;
use arrow::bitmap::{Bitmap, MutableBitmap};
use arrow::datatypes::ArrowDataType;
use arrow::types::NativeType;
use polars_utils::slice::*;
Expand Down Expand Up @@ -41,17 +41,6 @@ pub trait FixedLengthEncoding: Copy + Debug {
}
}

/// Single-byte encoding of a boolean: `false` becomes `[0]`, `true` becomes `[1]`.
impl FixedLengthEncoding for bool {
    type Encoded = [u8; 1];

    fn encode(self) -> Self::Encoded {
        [u8::from(self)]
    }

    /// Any non-zero byte decodes to `true`.
    fn decode(encoded: Self::Encoded) -> Self {
        let [byte] = encoded;
        byte != 0
    }
}

// encode as big endian
macro_rules! encode_unsigned {
($n:expr, $t:ty) => {
Expand Down Expand Up @@ -216,6 +205,28 @@ pub(crate) unsafe fn encode_iter<I: Iterator<Item = Option<T>>, T: FixedLengthEn
}
}

/// Row-encodes optional booleans using exactly one byte per value.
///
/// For every `(offset, value)` pair, a single sentinel byte is written to
/// `buffer[*offset]` and the offset is advanced by one:
/// `None` writes the field's null sentinel, `Some(false)` the false sentinel
/// and `Some(true)` the true sentinel.
///
/// Iteration stops at the shorter of `offsets` and `input`.
///
/// # Safety
/// Every offset that gets paired with an input value must be a valid index
/// into `buffer`; the write is not bounds-checked.
pub(crate) unsafe fn encode_bool_iter<I: Iterator<Item = Option<bool>>>(
    buffer: &mut [MaybeUninit<u8>],
    input: I,
    field: &EncodingField,
    offsets: &mut [usize],
) {
    let null_sentinel = get_null_sentinel(field);
    let true_sentinel = field.bool_true_sentinel();
    let false_sentinel = field.bool_false_sentinel();

    for (cursor, value) in offsets.iter_mut().zip(input) {
        let encoded = value.map_or(null_sentinel, |is_set| {
            if is_set {
                true_sentinel
            } else {
                false_sentinel
            }
        });

        *buffer.get_unchecked_mut(*cursor) = MaybeUninit::new(encoded);
        *cursor += 1;
    }
}

pub(super) unsafe fn decode_primitive<T: NativeType + FixedLengthEncoding>(
rows: &mut [&[u8]],
field: &EncodingField,
Expand Down Expand Up @@ -262,43 +273,51 @@ where
/// Decodes a row-encoded boolean column into a [`BooleanArray`].
///
/// The first byte of each row is compared against the field's null sentinel
/// and `true` sentinel, and each row slice is advanced past the consumed byte.
///
/// NOTE(review): this span is rendered from a diff view, so statements of the
/// previous implementation (the `collect::<Bitmap>()` / `increment_row_counter`
/// path) appear interleaved with the sentinel-based one. Confirm the statement
/// order against the actual file before relying on it.
///
/// # Safety
/// Every row must hold at least one byte: rows are read with
/// `get_unchecked(0)` and re-sliced with `get_unchecked(1..)`.
pub(super) unsafe fn decode_bool(rows: &mut [&[u8]], field: &EncodingField) -> BooleanArray {
let mut has_nulls = false;
let null_sentinel = get_null_sentinel(field);
let true_sentinel = field.bool_true_sentinel();

// A single pass yields the value bits and records whether any null sentinel was seen.
let values = Bitmap::from_trusted_len_iter_unchecked(rows.iter().map(|row| {
let b = *row.get_unchecked(0);
has_nulls |= b == null_sentinel;
b == true_sentinel
}));

// No null sentinel seen: skip the one-byte value in every row and return without a validity bitmap.
if !has_nulls {
rows.iter_mut()
.for_each(|row| *row = row.get_unchecked(1..));
return BooleanArray::new(ArrowDataType::Boolean, values, None);
}

let values = rows
.iter()
.map(|row| {
has_nulls |= *row.get_unchecked(0) == null_sentinel;
// skip null sentinel
let start = 1;
let end = start + bool::ENCODED_LEN - 1;
let slice = row.get_unchecked(start..end);
let bytes = <bool as FixedLengthEncoding>::Encoded::from_slice(slice);

if field.descending {
bool::decode_reverse(bytes)
} else {
bool::decode(bytes)
}
})
.collect::<Bitmap>();

let validity = if has_nulls {
Some(decode_nulls(rows, null_sentinel))
} else {
None
};

// validity byte and data length
let increment_len = bool::ENCODED_LEN;

increment_row_counter(rows, increment_len);
BooleanArray::new(ArrowDataType::Boolean, values, validity)
// Build the validity bitmap and advance each row by one byte in the same pass.
let validity = Bitmap::from_trusted_len_iter_unchecked(rows.iter_mut().map(|row| {
let v = *row.get_unchecked(0) != null_sentinel;
*row = row.get_unchecked(1..);
v
}));
BooleanArray::new(ArrowDataType::Boolean, values, Some(validity))
}
/// Advances every row slice by `fixed_size` bytes, dropping the consumed prefix.
///
/// # Safety
/// Each row must contain at least `fixed_size` bytes; the re-slicing is not
/// bounds-checked.
unsafe fn increment_row_counter(rows: &mut [&[u8]], fixed_size: usize) {
    rows.iter_mut()
        .for_each(|row| *row = row.get_unchecked(fixed_size..));
}

/// Decodes the validity bitmap of `rows`, or returns `None` when no row
/// starts with `null_sentinel`.
///
/// The rows before the first null are filled in as valid in bulk, the first
/// null itself is pushed as invalid, and only the remaining rows are compared
/// against the sentinel one by one.
///
/// # Safety
/// Every row must contain at least one byte; the first byte is read with
/// `get_unchecked(0)`.
pub(super) unsafe fn decode_opt_nulls(rows: &[&[u8]], null_sentinel: u8) -> Option<Bitmap> {
    let first_null = rows
        .iter()
        .position(|row| *row.get_unchecked(0) == null_sentinel)?;

    // Everything after the first null still has to be inspected row by row.
    let remaining = &rows[first_null + 1..];

    let mut validity = MutableBitmap::with_capacity(rows.len());
    validity.extend_constant(first_null, true);
    validity.push(false);
    validity.extend_from_trusted_len_iter_unchecked(
        remaining
            .iter()
            .map(|row| *row.get_unchecked(0) != null_sentinel),
    );

    Some(validity.freeze())
}

pub(super) unsafe fn decode_nulls(rows: &[&[u8]], null_sentinel: u8) -> Bitmap {
rows.iter()
.map(|row| *row.get_unchecked(0) != null_sentinel)
Expand Down
19 changes: 19 additions & 0 deletions crates/polars-row/src/row.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ use arrow::datatypes::ArrowDataType;
use arrow::ffi::mmap;
use arrow::offset::{Offsets, OffsetsBuffer};

const BOOLEAN_TRUE_SENTINEL: u8 = 0x03;
const BOOLEAN_FALSE_SENTINEL: u8 = 0x02;

#[derive(Clone, Default, Copy)]
pub struct EncodingField {
/// Whether to sort in descending order
Expand All @@ -30,6 +33,22 @@ impl EncodingField {
..Default::default()
}
}

/// Byte used to encode `true`: `BOOLEAN_TRUE_SENTINEL`, with all bits
/// inverted when the field is `descending`.
pub(crate) fn bool_true_sentinel(self) -> u8 {
    // XOR with 0xFF is a bitwise NOT on a `u8`; XOR with 0 is the identity.
    let invert_mask = if self.descending { u8::MAX } else { 0 };
    BOOLEAN_TRUE_SENTINEL ^ invert_mask
}

/// Byte used to encode `false`: `BOOLEAN_FALSE_SENTINEL`, with all bits
/// inverted when the field is `descending`.
pub(crate) fn bool_false_sentinel(self) -> u8 {
    // XOR with 0xFF is a bitwise NOT on a `u8`; XOR with 0 is the identity.
    let invert_mask = if self.descending { u8::MAX } else { 0 };
    BOOLEAN_FALSE_SENTINEL ^ invert_mask
}
}

#[derive(Default, Clone)]
Expand Down
19 changes: 3 additions & 16 deletions crates/polars-row/src/variable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ use arrow::datatypes::ArrowDataType;
use arrow::offset::Offsets;
use polars_utils::slice::Slice2Uninit;

use crate::fixed::{decode_nulls, get_null_sentinel};
use crate::fixed::{decode_opt_nulls, get_null_sentinel};
use crate::EncodingField;

/// The block size of the variable length encoding
Expand Down Expand Up @@ -183,11 +183,6 @@ pub(crate) unsafe fn encode_iter<'a, I: Iterator<Item = Option<&'a [u8]>>>(
}
}

/// Returns `true` if any row starts with `null_sentinel`.
///
/// # Safety
/// Every row must contain at least one byte; the first byte is read with
/// `get_unchecked(0)`.
unsafe fn has_nulls(rows: &[&[u8]], null_sentinel: u8) -> bool {
    rows.iter()
        .map(|row| *row.get_unchecked(0))
        .any(|first_byte| first_byte == null_sentinel)
}

pub(crate) unsafe fn encoded_item_len(
row: &[u8],
non_empty_sentinel: u8,
Expand Down Expand Up @@ -305,11 +300,7 @@ pub(super) unsafe fn decode_binary(rows: &mut [&[u8]], field: &EncodingField) ->
};

let null_sentinel = get_null_sentinel(field);
let validity = if has_nulls(rows, null_sentinel) {
Some(decode_nulls(rows, null_sentinel))
} else {
None
};
let validity = decode_opt_nulls(rows, null_sentinel);
let values_cap = rows
.iter()
.map(|row| {
Expand Down Expand Up @@ -380,11 +371,7 @@ pub(super) unsafe fn decode_binview(rows: &mut [&[u8]], field: &EncodingField) -
};

let null_sentinel = get_null_sentinel(field);
let validity = if has_nulls(rows, null_sentinel) {
Some(decode_nulls(rows, null_sentinel))
} else {
None
};
let validity = decode_opt_nulls(rows, null_sentinel);

let mut mutable = MutableBinaryViewArray::with_capacity(rows.len());

Expand Down
28 changes: 28 additions & 0 deletions py-polars/tests/unit/test_row_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,34 @@ def test_parametric_binary_order(df: pl.DataFrame) -> None:
parametric_order_base(df)


def test_order_bool() -> None:
    """Check the row-encoding order of Boolean series via `assert_order_series`."""
    dtype = pl.Boolean

    # Inputs containing a null, checked with the default options and with
    # `nulls_last=True`.
    nullable_lhs = [None, False, True]
    nullable_rhs = [True, False, None]
    assert_order_series(nullable_lhs, nullable_rhs, dtype, ["lt", "eq", "gt"])
    assert_order_series(
        nullable_lhs, nullable_rhs, dtype, ["gt", "eq", "lt"], nulls_last=True
    )

    # Null-free inputs, checked with the default options and with
    # `descending=True`.
    plain_lhs = [False, False, True, True]
    plain_rhs = [True, False, True, False]
    assert_order_series(plain_lhs, plain_rhs, dtype, ["lt", "eq", "eq", "gt"])
    assert_order_series(
        plain_lhs, plain_rhs, dtype, ["lt", "eq", "eq", "gt"], descending=True
    )


def test_order_int() -> None:
dtype = pl.Int32
assert_order_series([1, 2, 3], [3, 2, 1], dtype, ["lt", "eq", "gt"])
Expand Down

0 comments on commit 5eeb369

Please sign in to comment.