Skip to content

Commit

Permalink
feat: Normalize value_counts (#16917)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored Jun 13, 2024
1 parent 6711d4c commit e98f5cc
Show file tree
Hide file tree
Showing 12 changed files with 101 additions and 17 deletions.
17 changes: 16 additions & 1 deletion crates/polars-ops/src/series/ops/various.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,13 @@ use crate::series::ops::SeriesSealed;
pub trait SeriesMethods: SeriesSealed {
/// Create a [`DataFrame`] with the unique `values` of this [`Series`] and a column `"counts"`
/// with dtype [`IdxType`]
fn value_counts(&self, sort: bool, parallel: bool, name: String) -> PolarsResult<DataFrame> {
fn value_counts(
&self,
sort: bool,
parallel: bool,
name: String,
normalize: bool,
) -> PolarsResult<DataFrame> {
let s = self.as_series();
polars_ensure!(
s.name() != name,
Expand All @@ -21,6 +27,15 @@ pub trait SeriesMethods: SeriesSealed {
let groups = s.group_tuples(parallel, sort)?;
let values = unsafe { s.agg_first(&groups) };
let counts = groups.group_count().with_name(name.as_str());

let counts = if normalize {
let len = s.len() as f64;
let counts: Float64Chunked = counts.apply_values_generic(|count| count as f64 / len);
counts.into_series()
} else {
counts.into_series()
};

let cols = vec![values, counts.into_series()];
let df = unsafe { DataFrame::new_no_checks(cols) };
if sort {
Expand Down
3 changes: 2 additions & 1 deletion crates/polars-plan/src/dsl/function_expr/dispatch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,9 @@ pub(super) fn value_counts(
sort: bool,
parallel: bool,
name: String,
normalize: bool,
) -> PolarsResult<Series> {
s.value_counts(sort, parallel, name)
s.value_counts(sort, parallel, name, normalize)
.map(|df| df.into_struct(s.name()).into_series())
}

Expand Down
12 changes: 11 additions & 1 deletion crates/polars-plan/src/dsl/function_expr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ pub enum FunctionExpr {
sort: bool,
parallel: bool,
name: String,
normalize: bool,
},
#[cfg(feature = "unique_counts")]
UniqueCounts,
Expand Down Expand Up @@ -464,10 +465,12 @@ impl Hash for FunctionExpr {
sort,
parallel,
name,
normalize,
} => {
sort.hash(state);
parallel.hash(state);
name.hash(state);
normalize.hash(state);
},
#[cfg(feature = "unique_counts")]
UniqueCounts => {},
Expand Down Expand Up @@ -997,7 +1000,14 @@ impl From<FunctionExpr> for SpecialEq<Arc<dyn SeriesUdf>> {
sort,
parallel,
name,
} => map!(dispatch::value_counts, sort, parallel, name.clone()),
normalize,
} => map!(
dispatch::value_counts,
sort,
parallel,
name.clone(),
normalize
),
#[cfg(feature = "unique_counts")]
UniqueCounts => map!(dispatch::unique_counts),
Reverse => map!(dispatch::reverse),
Expand Down
8 changes: 7 additions & 1 deletion crates/polars-plan/src/dsl/function_expr/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,16 @@ impl FunctionExpr {
sort: _,
parallel: _,
name,
normalize,
} => mapper.map_dtype(|dt| {
let count_dt = if *normalize {
DataType::Float64
} else {
IDX_DTYPE
};
DataType::Struct(vec![
Field::new(fields[0].name().as_str(), dt.clone()),
Field::new(name, IDX_DTYPE),
Field::new(name, count_dt),
])
}),
#[cfg(feature = "unique_counts")]
Expand Down
3 changes: 2 additions & 1 deletion crates/polars-plan/src/dsl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1729,11 +1729,12 @@ impl Expr {
#[cfg(feature = "dtype-struct")]
/// Count all unique values and create a struct mapping value to count.
/// (Note that it is better to turn parallel off in the aggregation context).
pub fn value_counts(self, sort: bool, parallel: bool, name: String) -> Self {
pub fn value_counts(self, sort: bool, parallel: bool, name: String, normalize: bool) -> Self {
self.apply_private(FunctionExpr::ValueCounts {
sort,
parallel,
name,
normalize,
})
.with_function_options(|mut opts| {
opts.pass_name_to_apply = true;
Expand Down
4 changes: 2 additions & 2 deletions docs/src/rust/user-guide/expressions/structs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let out = ratings
.clone()
.lazy()
.select([col("Theatre").value_counts(true, true, "count".to_string())])
.select([col("Theatre").value_counts(true, true, "count".to_string(), false)])
.collect()?;
println!("{}", &out);
// --8<-- [end:state_value_counts]
Expand All @@ -26,7 +26,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let out = ratings
.clone()
.lazy()
.select([col("Theatre").value_counts(true, true, "count".to_string())])
.select([col("Theatre").value_counts(true, true, "count".to_string(), false)])
.unnest(["Theatre"])
.collect()?;
println!("{}", &out);
Expand Down
22 changes: 19 additions & 3 deletions py-polars/polars/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -9613,7 +9613,12 @@ def extend_constant(self, value: IntoExpr, n: int | IntoExprColumn) -> Self:
return self._from_pyexpr(self._pyexpr.extend_constant(value, n))

def value_counts(
self, *, sort: bool = False, parallel: bool = False, name: str = "count"
self,
*,
sort: bool = False,
parallel: bool = False,
name: str | None = None,
normalize: bool = False,
) -> Self:
"""
Count the occurrences of unique values.
Expand All @@ -9630,7 +9635,11 @@ def value_counts(
This option should likely not be enabled in a group by context,
as the computation is already parallelized per group.
name
Give the resulting count field a specific name; defaults to "count".
Give the resulting count column a specific name;
if `normalize` is True defaults to "proportion",
otherwise defaults to "count".
normalize
If true, gives relative frequencies of the unique values.
Returns
-------
Expand Down Expand Up @@ -9682,7 +9691,14 @@ def value_counts(
│ green ┆ 1 │
└───────┴─────┘
"""
return self._from_pyexpr(self._pyexpr.value_counts(sort, parallel, name))
if name is None:
if normalize:
name = "proportion"
else:
name = "count"
return self._from_pyexpr(
self._pyexpr.value_counts(sort, parallel, name, normalize)
)

def unique_counts(self) -> Self:
"""
Expand Down
22 changes: 19 additions & 3 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2399,7 +2399,12 @@ def hist(
return out.struct.unnest()

def value_counts(
self, *, sort: bool = False, parallel: bool = False, name: str = "count"
self,
*,
sort: bool = False,
parallel: bool = False,
name: str | None = None,
normalize: bool = False,
) -> DataFrame:
"""
Count the occurrences of unique values.
Expand All @@ -2416,7 +2421,11 @@ def value_counts(
This option should likely not be enabled in a group by context,
as the computation is already parallelized per group.
name
Give the resulting count column a specific name; defaults to "count".
Give the resulting count column a specific name;
if `normalize` is True defaults to "proportion",
otherwise defaults to "count".
normalize
If true, gives relative frequencies of the unique values.
Returns
-------
Expand Down Expand Up @@ -2452,8 +2461,15 @@ def value_counts(
│ green ┆ 1 │
└───────┴─────┘
"""
if name is None:
if normalize:
name = "proportion"
else:
name = "count"
return pl.DataFrame._from_pydf(
self._s.value_counts(sort=sort, parallel=parallel, name=name)
self._s.value_counts(
sort=sort, parallel=parallel, name=name, normalize=normalize
)
)

def unique_counts(self) -> Series:
Expand Down
7 changes: 5 additions & 2 deletions py-polars/src/expr/general.rs
Original file line number Diff line number Diff line change
Expand Up @@ -251,8 +251,11 @@ impl PyExpr {
/// Expression returning the number of elements (length, including nulls).
fn len(&self) -> Self {
    let length_expr = self.inner.clone().len();
    length_expr.into()
}
fn value_counts(&self, sort: bool, parallel: bool, name: String) -> Self {
self.inner.clone().value_counts(sort, parallel, name).into()
/// Build a `value_counts` expression: unique values paired with their
/// occurrence counts, or relative frequencies when `normalize` is true.
/// `name` is the label for the resulting count field.
fn value_counts(&self, sort: bool, parallel: bool, name: String, normalize: bool) -> Self {
    let counts_expr = self
        .inner
        .clone()
        .value_counts(sort, parallel, name, normalize);
    counts_expr.into()
}
fn unique_counts(&self) -> Self {
self.inner.clone().unique_counts().into()
Expand Down
1 change: 1 addition & 0 deletions py-polars/src/lazyframe/visitor/expr_nodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1126,6 +1126,7 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult<PyObject> {
sort: _,
parallel: _,
name: _,
normalize: _,
} => return Err(PyNotImplementedError::new_err("value counts")),
FunctionExpr::UniqueCounts => ("unique_counts",).to_object(py),
FunctionExpr::ApproxNUnique => {
Expand Down
10 changes: 8 additions & 2 deletions py-polars/src/series/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -755,10 +755,16 @@ impl PySeries {
self.series.tail(Some(n)).into()
}

fn value_counts(&self, sort: bool, parallel: bool, name: String) -> PyResult<PyDataFrame> {
/// Count occurrences of the unique values in this Series.
///
/// Delegates to the underlying `Series::value_counts`; when `normalize`
/// is true the counts column holds relative frequencies instead of raw
/// counts. Errors from the Rust side are converted into Python exceptions
/// via `PyPolarsErr`.
fn value_counts(
    &self,
    sort: bool,
    parallel: bool,
    name: String,
    normalize: bool,
) -> PyResult<PyDataFrame> {
    match self.series.value_counts(sort, parallel, name, normalize) {
        Ok(df) => Ok(df.into()),
        Err(err) => Err(PyPolarsErr::from(err).into()),
    }
}
Expand Down
9 changes: 9 additions & 0 deletions py-polars/tests/unit/operations/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,15 @@ def test_value_counts() -> None:
result_sorted = result.sort("a")
assert_frame_equal(result_sorted, expected)

out = pl.Series("a", [12, 3345, 12, 3, 4, 4, 1, 12]).value_counts(
normalize=True, sort=True
)
assert out["proportion"].sum() == 1.0
assert out.to_dict(as_series=False) == {
"a": [12, 4, 3345, 3, 1],
"proportion": [0.375, 0.25, 0.125, 0.125, 0.125],
}


def test_value_counts_logical_type() -> None:
# test logical type
Expand Down

0 comments on commit e98f5cc

Please sign in to comment.