Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python, rust): implement unique counts for boolean datatype #16588

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
71 changes: 71 additions & 0 deletions crates/polars-ops/src/series/ops/unique.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use std::hash::Hash;

use arrow::array::Array;
use polars_compute::distinct_count::DistinctCountKernel;
use polars_core::hashing::_HASHMAP_INIT_SIZE;
use polars_core::prelude::*;
use polars_core::utils::NoNull;
Expand All @@ -25,6 +27,71 @@ where
out.into_inner()
}

fn unique_counts_boolean_helper(ca: &BooleanChunked) -> IdxCa {
if ca.is_empty() {
return IdxCa::new(ca.name(), [] as [IdxSize; 0]);
}

let ca = ca.rechunk();
Copy link
Contributor

@mcrumiller mcrumiller Jun 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ritchie46 can answer better here, but I think if we're spending the energy rechunking a series (which mem copies), the original array should receive the benefit as well, which you can do by making ca mut and calling *ca.rechunk().

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might be. However, I can't see this being done anywhere else in the codebase, which makes me question whether it should be done, so will wait for Ritchie's answer.

I guess you meant @ritchie46 btw.

let arr = ca.downcast_iter().next().unwrap();

if arr.distinct_count() == 1 {
return IdxCa::new(ca.name(), [arr.len() as IdxSize]);
}

let (n_true, n_null);
if let Some(validity) = arr.validity() {
n_null = validity.unset_bits();
n_true = (arr.values() & validity).set_bits();
} else {
n_null = 0;
n_true = arr.values().set_bits();
}
let n_false = arr.len() - n_true - n_null;
let (n_true, n_false, n_null) = (n_true as IdxSize, n_false as IdxSize, n_null as IdxSize);

if n_true == 0 {
match arr.is_null(0) {
true => return IdxCa::new(ca.name(), [n_null, n_false]),
false => return IdxCa::new(ca.name(), [n_false, n_null]),
}
} else if n_false == 0 {
match arr.is_null(0) {
true => return IdxCa::new(ca.name(), [n_null, n_true]),
false => return IdxCa::new(ca.name(), [n_true, n_null]),
}
} else if n_null == 0 {
match arr.value(0) {
true => return IdxCa::new(ca.name(), [n_true, n_false]),
false => return IdxCa::new(ca.name(), [n_false, n_true]),
}
}

henrikig marked this conversation as resolved.
Show resolved Hide resolved
if arr.is_null(0) {
let first_non_null_idx = arr.validity().unwrap().iter().position(|v| v).unwrap();
match arr.value(first_non_null_idx) {
true => return IdxCa::new(ca.name(), [n_null, n_true, n_false]),
false => return IdxCa::new(ca.name(), [n_null, n_false, n_true]),
}
} else {
let first_val = arr.value(0);
let second_val_idx = arr
.validity()
.unwrap()
.iter()
.zip(arr.values())
.position(|(v, val)| !v || val != first_val)
.unwrap();

match (first_val, arr.is_null(second_val_idx)) {
(true, true) => return IdxCa::new(ca.name(), [n_true, n_null, n_false]),
(true, false) => return IdxCa::new(ca.name(), [n_true, n_false, n_null]),
(false, true) => return IdxCa::new(ca.name(), [n_false, n_null, n_true]),
(false, false) => return IdxCa::new(ca.name(), [n_false, n_true, n_null]),
}
}
}

/// Returns a count of the unique values in the order of appearance.
pub fn unique_counts(s: &Series) -> PolarsResult<Series> {
if s.dtype().to_physical().is_numeric() {
Expand All @@ -39,6 +106,10 @@ pub fn unique_counts(s: &Series) -> PolarsResult<Series> {
DataType::String => {
Ok(unique_counts_helper(s.str().unwrap().into_iter()).into_series())
},
DataType::Boolean => {
let ca = s.bool().unwrap();
Ok(unique_counts_boolean_helper(ca).into_series())
},
DataType::Null => {
let ca = if s.is_empty() {
IdxCa::new(s.name(), [] as [IdxSize; 0])
Expand Down
32 changes: 32 additions & 0 deletions py-polars/tests/unit/operations/unique/test_unique_counts.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
from __future__ import annotations

from datetime import datetime

import pytest

import polars as pl
from polars.testing import assert_series_equal

Expand Down Expand Up @@ -41,3 +45,31 @@ def test_unique_counts_null() -> None:
s = pl.Series([None, None, None])
expected = pl.Series([3], dtype=pl.UInt32)
assert_series_equal(s.unique_counts(), expected)


@pytest.mark.parametrize(
("input", "expected"),
[
([], []),
([None, None, None], [3]),
([True, True, True], [3]),
([False, False, False], [3]),
([True, False, False, True, True], [3, 2]),
([False, True, False, True, True], [2, 3]),
([True, None, True, None, None], [2, 3]),
([None, True, None, True, True], [2, 3]),
([False, None, False, None, None], [2, 3]),
([None, False, None, False, False], [2, 3]),
([True, False, None, False, None, None], [1, 2, 3]),
([True, None, False, False, None, False], [1, 2, 3]),
([False, True, True, None, None, None], [1, 2, 3]),
([False, None, True, True, None, True], [1, 2, 3]),
([None, False, True, False, True, True], [1, 2, 3]),
([None, True, True, False, False, False], [1, 2, 3]),
],
)
def test_unique_counts_bool(input: list[bool], expected: list[int]) -> None:
assert_series_equal(
pl.Series("bool", input, dtype=pl.Boolean).unique_counts(),
pl.Series("bool", expected, dtype=pl.UInt32),
)