Skip to content

Commit

Permalink
fix: Fix more global categorical issues (#20547)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored Jan 3, 2025
1 parent 7c64640 commit 1517599
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 29 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ jobs:
const previousSizeMB = previousSize !== 'Unknown' ? (previousSize / 1024 / 1024).toFixed(4) : 'Unknown';
const currentSizeMB = currentSize !== 'Unknown' ? (currentSize / 1024 / 1024).toFixed(4) : 'Unknown';
let commentBody = `The previous wheel size was **${previousSizeMB} MB**.\nThe current wheel size after this PR is **${currentSizeMB} MB**.`;
let commentBody = `The uncompressed binary size was **${previousSizeMB} MB**.\nThe uncompressed binary size after this PR is **${currentSizeMB} MB**.`;
// Calculate percentage increase if both sizes are available
if (previousSize !== 'Unknown' && currentSize !== '') {
Expand Down
30 changes: 23 additions & 7 deletions crates/polars-core/src/chunked_array/comparison/categorical.rs
Original file line number Diff line number Diff line change
Expand Up @@ -374,13 +374,29 @@ where
// Apply comparison on categories map and then do a lookup
let bitmap = str_single_compare_function(lhs.get_rev_map().get_categories(), rhs);

Ok(
BooleanChunked::from_iter_trusted_length(lhs.physical().into_iter().map(|opt_idx| {
// SAFETY: indexing into bitmap with same length as original array
opt_idx.map(|idx| unsafe { bitmap.get_bit_unchecked(idx as usize) })
}))
.with_name(lhs.name().clone()),
)
let mask = match lhs.get_rev_map().as_ref() {
RevMapping::Local(_, _) => {
BooleanChunked::from_iter_trusted_length(lhs.physical().into_iter().map(
|opt_idx| {
// SAFETY: indexing into bitmap with same length as original array
opt_idx.map(|idx| unsafe { bitmap.get_bit_unchecked(idx as usize) })
},
))
},
RevMapping::Global(idx_map, _, _) => {
BooleanChunked::from_iter_trusted_length(lhs.physical().into_iter().map(
|opt_idx| {
// SAFETY: indexing into bitmap with same length as original array
opt_idx.map(|idx| unsafe {
let idx = *idx_map.get(&idx).unwrap();
bitmap.get_bit_unchecked(idx as usize)
})
},
))
},
};

Ok(mask.with_name(lhs.name().clone()))
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use polars_compute::unique::{DictionaryRangedUniqueState, RangedUniqueKernel};

use super::*;

impl CategoricalChunked {
Expand Down Expand Up @@ -41,18 +39,7 @@ impl CategoricalChunked {
Ok(out)
}
} else {
let mut state = DictionaryRangedUniqueState::new(cat_map.get_categories().to_boxed());
for chunk in self.physical().downcast_iter() {
state.key_state().append(chunk);
}
let (_, unique, _) = state.finalize_unique().take();
let ca = unsafe {
UInt32Chunked::from_chunks_and_dtype_unchecked(
self.physical().name().clone(),
vec![unique.to_boxed()],
DataType::UInt32,
)
};
let ca = self.physical().unique()?;
// SAFETY:
// we only removed some indexes so we are still in bounds
unsafe {
Expand All @@ -70,12 +57,7 @@ impl CategoricalChunked {
if self._can_fast_unique() {
Ok(self.get_rev_map().len())
} else {
let cat_map = self.get_rev_map();
let mut state = DictionaryRangedUniqueState::new(cat_map.get_categories().to_boxed());
for chunk in self.physical().downcast_iter() {
state.key_state().append(chunk);
}
Ok(state.finalize_n_unique())
self.physical().n_unique()
}
}

Expand Down
21 changes: 20 additions & 1 deletion py-polars/tests/unit/datatypes/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -904,7 +904,7 @@ def test_perfect_group_by_19950() -> None:
def test_categorical_unique() -> None:
s = pl.Series(["a", "b", None], dtype=pl.Categorical)
assert s.n_unique() == 3
assert s.unique().to_list() == ["a", "b", None]
assert s.unique().sort().to_list() == [None, "a", "b"]


@StringCache()
Expand All @@ -925,3 +925,22 @@ def test_categorical_unique_20539() -> None:
"unique": [["a", "b"], ["b", "c"], ["c"]],
"unique_with_order": [["a", "b"], ["b", "c"], ["c"]],
}


@StringCache()
@pytest.mark.may_fail_auto_streaming
def test_categorical_prefill() -> None:
# https://github.com/pola-rs/polars/pull/20547#issuecomment-2569473443
# prefill cache
pl.Series(["aaa", "bbb", "ccc"], dtype=pl.Categorical) # pre-fill cache

# test_compare_categorical_single
assert (pl.Series(["a"], dtype=pl.Categorical) < "a").to_list() == [False]

# test_unique_categorical
a = pl.Series(["a"], dtype=pl.Categorical)
assert a.unique().to_list() == ["a"]

s = pl.Series(["1", "2", "3"], dtype=pl.Categorical)
s = s.filter([True, False, True])
assert s.n_unique() == 2

0 comments on commit 1517599

Please sign in to comment.