From ee633539989955ea9a214ef69cd8c70729c19942 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Wed, 4 Jan 2023 14:45:12 -0600 Subject: [PATCH] Refactor weight computation Fixes #292 --- NEWS.md | 3 ++ R/lump.R | 59 ++++++++++++++++++----------------- R/reorder.R | 8 ++--- man/fct_lump.Rd | 2 +- tests/testthat/_snaps/lump.md | 6 ++-- tests/testthat/test-lump.R | 9 ++++++ 6 files changed, 48 insertions(+), 39 deletions(-) diff --git a/NEWS.md b/NEWS.md index 26a7c7f5..d37b6feb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # forcats (development version) +* `fct_lump_prop()` and friends now work correctly if you supply weights + and have empty levels (#292). + * `fct_expand()` gains an `after` argument so that you can choose where the new levels are placed (#138). diff --git a/R/lump.R b/R/lump.R index 8c0eea83..90f2ff27 100644 --- a/R/lump.R +++ b/R/lump.R @@ -72,15 +72,13 @@ fct_lump <- function(f, n, prop, w = NULL, other_level = "Other", ties.method = c("min", "average", "first", "last", "random", "max")) { ties.method <- match.arg(ties.method) - check_calc_levels(f, w) - if (missing(n) && missing(prop)) { - fct_lump_lowfreq(f, other_level = other_level) + fct_lump_lowfreq(f, w = w,other_level = other_level) } else if (missing(prop)) { - fct_lump_n(f, n, w, other_level, ties.method) + fct_lump_n(f, n, w = w, other_level = other_level, ties.method = ties.method) } else if (missing(n)) { - fct_lump_prop(f, prop, w, other_level) + fct_lump_prop(f, prop, w = w, other_level = other_level) } else { cli::cli_abort("Must supply only one of {.arg n} and {.arg prop}") } @@ -89,14 +87,14 @@ fct_lump <- function(f, n, prop, w = NULL, other_level = "Other", #' @export #' @rdname fct_lump fct_lump_min <- function(f, min, w = NULL, other_level = "Other") { - calcs <- check_calc_levels(f, w) - f <- calcs$f + f <- check_factor(f) + level_w <- compute_weights(f, w) if (!is.numeric(min) || length(min) != 1 || min < 0) { cli::cli_abort("{.arg min} must be a positive number") } - new_levels <- ifelse(calcs$count >= min, levels(f), other_level) + new_levels <- ifelse(level_w >= min, levels(f), other_level) if (other_level %in% new_levels) { f <- lvls_revalue(f, new_levels) @@ -109,14 +107,20 @@ fct_lump_min <- function(f, min, w = NULL, other_level = "Other") { #' @export #' @rdname fct_lump fct_lump_prop <- function(f, prop, w = NULL, other_level = "Other") { - calcs <- check_calc_levels(f, w) - f <- calcs$f + f <- check_factor(f) + level_w <- compute_weights(f, w) if (!is.numeric(prop) || length(prop) != 1) { cli::cli_abort("{.arg prop} must be a number") } - prop_n <- calcs$count / calcs$total + # Compute proportion of total, including NAs + if (is.null(w)) { + prop_n <- level_w / length(f) + } else { + prop_n <- level_w / sum(w) + } + if (prop < 0) { new_levels <- ifelse(prop_n <= -prop, levels(f), other_level) } else { @@ -141,19 +145,19 @@ fct_lump_prop <- function(f, prop, w = NULL, other_level = "Other") { #' @rdname fct_lump fct_lump_n <- function(f, n, w = NULL, other_level = "Other", ties.method = c("min", "average", "first", "last", "random", "max")) { + f <- check_factor(f) ties.method <- match.arg(ties.method) - calcs <- check_calc_levels(f, w) - f <- calcs$f + level_w <- compute_weights(f, w) if (!is.numeric(n) || length(n) != 1) { cli::cli_abort("{.arg n} must be a number") } if (n < 0) { - rank <- rank(calcs$count, ties.method = ties.method) + rank <- rank(level_w, ties.method = ties.method) n <- -n } else { - rank <- rank(-calcs$count, ties.method = ties.method) + rank <- rank(-level_w, ties.method = ties.method) } new_levels <- ifelse(rank <= n, levels(f), other_level) @@ -173,11 +177,11 @@ fct_lump_n <- function(f, n, w = NULL, other_level = "Other", #' @export #' @rdname fct_lump -fct_lump_lowfreq <- function(f, other_level = "Other") { - calcs <- check_calc_levels(f, NULL) - f <- calcs$f +fct_lump_lowfreq <- function(f, w = NULL, other_level = "Other") { + f <- check_factor(f) + level_w <- compute_weights(f, w) - new_levels <- ifelse(!in_smallest(calcs$count), levels(f), other_level) + new_levels <- ifelse(!in_smallest(level_w), levels(f), other_level) if (other_level %in% new_levels) { f <- lvls_revalue(f, new_levels) @@ -187,18 +191,15 @@ fct_lump_lowfreq <- function(f, other_level = "Other") { } } -check_calc_levels <- function(f, w = NULL, call = caller_env()) { - f <- check_factor(f) +compute_weights <- function(f, w = NULL, call = caller_env()) { w <- check_weights(w, length(f), call = call) - if (is.null(w)) { - count <- as.vector(table(f)) - total <- length(f) - } else { - count <- as.vector(tapply(w, f, FUN = sum)) - total <- sum(w) - } - list(f = f, count = count, total = total) + w <- w %||% rep(1L, length(f)) + n <- as.vector(tapply(w, f, sum)) + # fill in counts for empty levels + n[is.na(n)] <- 0 + + n } # Lump together smallest groups, ensuring that the collective diff --git a/R/reorder.R b/R/reorder.R index 5a9b4216..4cfa9e9f 100644 --- a/R/reorder.R +++ b/R/reorder.R @@ -133,12 +133,8 @@ fct_inorder <- function(f, ordered = NA) { #' @inheritParams fct_lump fct_infreq <- function(f, w = NULL, ordered = NA) { f <- check_factor(f) - - check_weights(w, length(f)) - w <- w %||% rep(1L, length(f)) - - freq <- tapply(w, f, sum) - lvls_reorder(f, order(freq, decreasing = TRUE), ordered = ordered) + w <- compute_weights(f, w) + lvls_reorder(f, order(w, decreasing = TRUE), ordered = ordered) } #' @export diff --git a/man/fct_lump.Rd b/man/fct_lump.Rd index e89f260d..05aa5e7f 100644 --- a/man/fct_lump.Rd +++ b/man/fct_lump.Rd @@ -29,7 +29,7 @@ fct_lump_n( ties.method = c("min", "average", "first", "last", "random", "max") ) -fct_lump_lowfreq(f, other_level = "Other") +fct_lump_lowfreq(f, w = NULL, other_level = "Other") } \arguments{ \item{f}{A factor (or character vector).} diff --git a/tests/testthat/_snaps/lump.md b/tests/testthat/_snaps/lump.md index f706ac08..af28fa20 100644 --- a/tests/testthat/_snaps/lump.md +++ b/tests/testthat/_snaps/lump.md @@ -3,17 +3,17 @@ Code fct_lump(letters, w = letters) Condition - Error in `fct_lump()`: + Error in `fct_lump_lowfreq()`: ! `w` must be a numeric vector, not a string. Code fct_lump(letters, w = 1:10) Condition - Error in `fct_lump()`: + Error in `fct_lump_lowfreq()`: ! `w` must be the same length as `f` (26), not length 10. Code fct_lump(letters, w = c(-1, rep(1, 24), -1)) Condition - Error in `fct_lump()`: + Error in `fct_lump_lowfreq()`: ! All `w` must be non-negative and non-missing. 2 problems at positions 1 and 26. diff --git a/tests/testthat/test-lump.R b/tests/testthat/test-lump.R index 20498b0e..6c824b4a 100644 --- a/tests/testthat/test-lump.R +++ b/tests/testthat/test-lump.R @@ -126,6 +126,15 @@ test_that("values are correctly weighted", { ) }) +test_that("can use weights with empty levels", { + f <- factor(c("a", "a", "b", "c"), levels = c("a", "b", "c", "d")) + + expect_equal( + fct_lump_prop(f, prop = 0.25, w = rep(1, 4)), + fct(c("a", "a", "Other", "Other")) + ) +}) + test_that("do not change the label when no lumping occurs", { f <- c("a", "a", "a", "a", "b", "b", "b", "c", "c", "d") expect_equal(levels(fct_lump(f, n = 3)), c("a", "b", "c", "d"))