Skip to content

Commit

Permalink
fix(rust): properly set boolean distinct count (#16782)
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite authored Jun 6, 2024
1 parent 90505af commit 1fbfa08
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 11 deletions.
10 changes: 10 additions & 0 deletions crates/polars-arrow/src/bitmap/bitmap_ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,16 @@ fn eq(lhs: &Bitmap, rhs: &Bitmap) -> bool {
lhs_remainder.zip(rhs_remainder).all(|(x, y)| x == y)
}

/// Counts the bits that are set in both `lhs` and `rhs`.
///
/// Every pair of values handed to the first closure by `binary_fold` is AND-ed and its set
/// bits are counted; the second closure sums those partial counts, starting from `0`.
pub fn num_intersections_with(lhs: &Bitmap, rhs: &Bitmap) -> usize {
    binary_fold(
        lhs,
        rhs,
        // Bits that survive the AND are set on both sides.
        |left_bits, right_bits| (left_bits & right_bits).count_ones() as usize,
        0,
        |total, partial| total + partial,
    )
}

pub fn intersects_with(lhs: &Bitmap, rhs: &Bitmap) -> bool {
binary_fold(
lhs,
Expand Down
7 changes: 6 additions & 1 deletion crates/polars-arrow/src/bitmap/immutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use either::Either;
use polars_error::{polars_bail, PolarsResult};

use super::utils::{count_zeros, fmt, get_bit, get_bit_unchecked, BitChunk, BitChunks, BitmapIter};
use super::{chunk_iter_to_vec, intersects_with, IntoIter, MutableBitmap};
use super::{chunk_iter_to_vec, intersects_with, num_intersections_with, IntoIter, MutableBitmap};
use crate::array::Splitable;
use crate::bitmap::aligned::AlignedBitmapSlice;
use crate::bitmap::iterator::{
Expand Down Expand Up @@ -480,6 +480,11 @@ impl Bitmap {
pub fn intersects_with(&self, other: &Self) -> bool {
// Forwards to the free function `intersects_with` imported from `super`.
intersects_with(self, other)
}

/// Calculates the number of shared set bits between two [`Bitmap`]s.
///
/// A bit is shared when it is set in both `self` and `other`. This forwards to the free
/// function `num_intersections_with` imported from `super`.
pub fn num_intersections_with(&self, other: &Self) -> usize {
num_intersections_with(self, other)
}
}

impl<P: AsRef<[bool]>> From<P> for Bitmap {
Expand Down
64 changes: 54 additions & 10 deletions crates/polars-compute/src/distinct_count.rs
Original file line number Diff line number Diff line change
@@ -1,24 +1,68 @@
use arrow::array::{Array, BooleanArray};

/// Kernel to calculate the number of unique elements
///
/// A null is also considered a unique value
pub trait DistinctCountKernel {
/// Calculate the number of unique elements in [`Self`]
///
/// A null is also considered a unique value, so an array that contains at least one null
/// reports one more distinct value than its non-null elements alone would.
fn distinct_count(&self) -> usize;
}

impl DistinctCountKernel for BooleanArray {
fn distinct_count(&self) -> usize {
if self.len() - self.null_count() == 0 {
if self.len() == 0 {
return 0;
}

if self.null_count() == 0 {
let unset_bits = self.values().unset_bits();
2 - usize::from(unset_bits == 0 || unset_bits == self.values().len())
} else {
let values = self.values() & self.validity().unwrap();
let unset_bits = self.values().unset_bits();
3 - usize::from(unset_bits == 0 || unset_bits == values.len())
let null_count = self.null_count();

if self.len() == null_count {
return 1;
}

let values = self.values();

if null_count == 0 {
let unset_bits = values.unset_bits();
let is_uniform = unset_bits == 0 || unset_bits == values.len();
return 2 - usize::from(is_uniform);
}

let validity = self.validity().unwrap();
let set_bits = values.num_intersections_with(validity);
let is_uniform = set_bits == 0 || set_bits == validity.set_bits();
2 + usize::from(!is_uniform)
}
}

#[test]
fn test_boolean_distinct_count() {
    use arrow::bitmap::Bitmap;
    use arrow::datatypes::ArrowDataType;

    // Builds a `BooleanArray` from plain vectors and returns its distinct count.
    let distinct_count_of = |values: Vec<bool>, validity: Option<Vec<bool>>| -> usize {
        let validity: Option<Bitmap> = validity.map(Bitmap::from_iter);
        let arr = BooleanArray::new(ArrowDataType::Boolean, Bitmap::from_iter(values), validity);
        arr.distinct_count()
    };

    // Empty arrays, with and without a validity bitmap.
    assert_eq!(distinct_count_of(vec![], None), 0);
    assert_eq!(distinct_count_of(vec![], Some(vec![])), 0);

    // Single element: valid or null, there is exactly one distinct value.
    assert_eq!(distinct_count_of(vec![true], None), 1);
    assert_eq!(distinct_count_of(vec![true], Some(vec![true])), 1);
    assert_eq!(distinct_count_of(vec![true], Some(vec![false])), 1);

    // Mixed values, without and with a null.
    assert_eq!(distinct_count_of(vec![true, false], None), 2);
    assert_eq!(distinct_count_of(vec![true, false, false], None), 2);
    assert_eq!(
        distinct_count_of(vec![true, false, false], Some(vec![true, true, false])),
        3
    );

    // Copied from https://github.com/pola-rs/polars/pull/16765#discussion_r1629426159
    assert_eq!(
        distinct_count_of(
            vec![true, true, true, true, true],
            Some(vec![true, false, true, false, false])
        ),
        2
    );
    assert_eq!(
        distinct_count_of(
            vec![false, true, false, true, true],
            Some(vec![true, false, true, false, false])
        ),
        2
    );
    assert_eq!(
        distinct_count_of(
            vec![true, false, true, false, true, true],
            Some(vec![true, true, false, true, false, false])
        ),
        3
    );
}

0 comments on commit 1fbfa08

Please sign in to comment.