Skip to content

Commit

Permalink
Refactor weight computation
Browse files Browse the repository at this point in the history
Fixes #292
  • Loading branch information
hadley committed Jan 4, 2023
1 parent 5d694dd commit ee63353
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 39 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# forcats (development version)

* `fct_lump_prop()` and friends now work correctly if you supply weights
and have empty levels (#292).

* `fct_expand()` gains an `after` argument so that you can choose where
the new levels are placed (#138).

Expand Down
59 changes: 30 additions & 29 deletions R/lump.R
Original file line number Diff line number Diff line change
Expand Up @@ -72,15 +72,13 @@
fct_lump <- function(f, n, prop, w = NULL, other_level = "Other",
ties.method = c("min", "average", "first", "last", "random", "max")) {
ties.method <- match.arg(ties.method)
check_calc_levels(f, w)


if (missing(n) && missing(prop)) {
fct_lump_lowfreq(f, other_level = other_level)
fct_lump_lowfreq(f, w = w,other_level = other_level)
} else if (missing(prop)) {
fct_lump_n(f, n, w, other_level, ties.method)
fct_lump_n(f, n, w = w, other_level = other_level, ties.method = ties.method)
} else if (missing(n)) {
fct_lump_prop(f, prop, w, other_level)
fct_lump_prop(f, prop, w = w, other_level = other_level)
} else {
cli::cli_abort("Must supply only one of {.arg n} and {.arg prop}")
}
Expand All @@ -89,14 +87,14 @@ fct_lump <- function(f, n, prop, w = NULL, other_level = "Other",
#' @export
#' @rdname fct_lump
fct_lump_min <- function(f, min, w = NULL, other_level = "Other") {
calcs <- check_calc_levels(f, w)
f <- calcs$f
f <- check_factor(f)
level_w <- compute_weights(f, w)

if (!is.numeric(min) || length(min) != 1 || min < 0) {
cli::cli_abort("{.arg min} must be a positive number")
}

new_levels <- ifelse(calcs$count >= min, levels(f), other_level)
new_levels <- ifelse(level_w >= min, levels(f), other_level)

if (other_level %in% new_levels) {
f <- lvls_revalue(f, new_levels)
Expand All @@ -109,14 +107,20 @@ fct_lump_min <- function(f, min, w = NULL, other_level = "Other") {
#' @export
#' @rdname fct_lump
fct_lump_prop <- function(f, prop, w = NULL, other_level = "Other") {
calcs <- check_calc_levels(f, w)
f <- calcs$f
f <- check_factor(f)
level_w <- compute_weights(f, w)

if (!is.numeric(prop) || length(prop) != 1) {
cli::cli_abort("{.arg prop} must be a number")
}

prop_n <- calcs$count / calcs$total
# Compute proportion of total, including NAs
if (is.null(w)) {
prop_n <- level_w / length(f)
} else {
prop_n <- level_w / sum(w)
}

if (prop < 0) {
new_levels <- ifelse(prop_n <= -prop, levels(f), other_level)
} else {
Expand All @@ -141,19 +145,19 @@ fct_lump_prop <- function(f, prop, w = NULL, other_level = "Other") {
#' @rdname fct_lump
fct_lump_n <- function(f, n, w = NULL, other_level = "Other",
ties.method = c("min", "average", "first", "last", "random", "max")) {
f <- check_factor(f)
ties.method <- match.arg(ties.method)
calcs <- check_calc_levels(f, w)
f <- calcs$f
level_w <- compute_weights(f, w)

if (!is.numeric(n) || length(n) != 1) {
cli::cli_abort("{.arg n} must be a number")
}

if (n < 0) {
rank <- rank(calcs$count, ties.method = ties.method)
rank <- rank(level_w, ties.method = ties.method)
n <- -n
} else {
rank <- rank(-calcs$count, ties.method = ties.method)
rank <- rank(-level_w, ties.method = ties.method)
}

new_levels <- ifelse(rank <= n, levels(f), other_level)
Expand All @@ -173,11 +177,11 @@ fct_lump_n <- function(f, n, w = NULL, other_level = "Other",

#' @export
#' @rdname fct_lump
fct_lump_lowfreq <- function(f, other_level = "Other") {
calcs <- check_calc_levels(f, NULL)
f <- calcs$f
fct_lump_lowfreq <- function(f, w = NULL, other_level = "Other") {
f <- check_factor(f)
level_w <- compute_weights(f, w)

new_levels <- ifelse(!in_smallest(calcs$count), levels(f), other_level)
new_levels <- ifelse(!in_smallest(level_w), levels(f), other_level)

if (other_level %in% new_levels) {
f <- lvls_revalue(f, new_levels)
Expand All @@ -187,18 +191,15 @@ fct_lump_lowfreq <- function(f, other_level = "Other") {
}
}

check_calc_levels <- function(f, w = NULL, call = caller_env()) {
f <- check_factor(f)
compute_weights <- function(f, w = NULL, call = caller_env()) {
w <- check_weights(w, length(f), call = call)

if (is.null(w)) {
count <- as.vector(table(f))
total <- length(f)
} else {
count <- as.vector(tapply(w, f, FUN = sum))
total <- sum(w)
}
list(f = f, count = count, total = total)
w <- w %||% rep(1L, length(f))
n <- as.vector(tapply(w, f, sum))
# fill in counts for empty levels
n[is.na(n)] <- 0

n
}

# Lump together smallest groups, ensuring that the collective
Expand Down
8 changes: 2 additions & 6 deletions R/reorder.R
Original file line number Diff line number Diff line change
Expand Up @@ -133,12 +133,8 @@ fct_inorder <- function(f, ordered = NA) {
#' @inheritParams fct_lump
fct_infreq <- function(f, w = NULL, ordered = NA) {
f <- check_factor(f)

check_weights(w, length(f))
w <- w %||% rep(1L, length(f))

freq <- tapply(w, f, sum)
lvls_reorder(f, order(freq, decreasing = TRUE), ordered = ordered)
w <- compute_weights(f, w)
lvls_reorder(f, order(w, decreasing = TRUE), ordered = ordered)
}

#' @export
Expand Down
2 changes: 1 addition & 1 deletion man/fct_lump.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions tests/testthat/_snaps/lump.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,17 @@
Code
fct_lump(letters, w = letters)
Condition
Error in `fct_lump()`:
Error in `fct_lump_lowfreq()`:
! `w` must be a numeric vector, not a string.
Code
fct_lump(letters, w = 1:10)
Condition
Error in `fct_lump()`:
Error in `fct_lump_lowfreq()`:
! `w` must be the same length as `f` (26), not length 10.
Code
fct_lump(letters, w = c(-1, rep(1, 24), -1))
Condition
Error in `fct_lump()`:
Error in `fct_lump_lowfreq()`:
! All `w` must be non-negative and non-missing.
2 problems at positions 1 and 26.

9 changes: 9 additions & 0 deletions tests/testthat/test-lump.R
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,15 @@ test_that("values are correctly weighted", {
)
})

test_that("can use weights with empty levels", {
f <- factor(c("a", "a", "b", "c"), levels = c("a", "b", "c", "d"))

expect_equal(
fct_lump_prop(f, prop = 0.25, w = rep(1, 4)),
fct(c("a", "a", "Other", "Other"))
)
})

test_that("do not change the label when no lumping occurs", {
f <- c("a", "a", "a", "a", "b", "b", "b", "c", "c", "d")
expect_equal(levels(fct_lump(f, n = 3)), c("a", "b", "c", "d"))
Expand Down

0 comments on commit ee63353

Please sign in to comment.