diff --git a/NEWS.md b/NEWS.md index e8cea03b..d7d1c3b1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,9 @@ * `as_factor()` and `fct_inorder()` accept NA levels (#98). +* `fct_lump()` gains `weights` argument (#70, @wilkox) to weight value + frequencies before lumping them together (#68). + # forcats 0.2.0 ## New functions diff --git a/R/lump.R b/R/lump.R index ddf953b5..e36e1392 100644 --- a/R/lump.R +++ b/R/lump.R @@ -14,6 +14,8 @@ #' Positive `prop` preserves values that appear at least #' `prop` of the time. Negative `prop` preserves values that #' appear at most `-prop` of the time. +#' @param weights An optional numeric vector giving weights for frequency of +#' each value (not level) in f. #' @param other_level Value of level used for "other" values. Always #' placed at end of levels. #' @param ties.method A character string specifying how ties are @@ -39,20 +41,36 @@ #' fct_lump(x, n = -3) #' fct_lump(x, prop = -0.1) #' +#' # Use weighted frequencies +#' w <- c(rep(2, 50), rep(1, 50)) +#' fct_lump(x, n = 5, weights = w) +#' #' # Use ties.method to control how tied factors are collapsed #' fct_lump(x, n = 6) #' fct_lump(x, n = 6, ties.method = "max") #' -fct_lump <- function(f, n, prop, other_level = "Other", +fct_lump <- function(f, n, prop, weights = NULL, other_level = "Other", ties.method = c("min", "average", "first", "last", "random", "max")) { f <- check_factor(f) ties.method <- match.arg(ties.method) + if (!is.null(weights)) { + if (!is.numeric(weights)) { + stop("`weights` must be a numeric vector", call. = FALSE) + } else if (length(f) != length(weights)) { + stop("Different lengths of `f` and `weights`", call. = FALSE) + } + } + levels <- levels(f) - count <- table(f) + if (is.null(weights)) { + count <- table(f) + } else { + count <- tapply(weights, f, FUN = sum) + } if (!xor(missing(n), missing(prop))) { - new_levels <- ifelse(!in_smallest(table(f)), levels, other_level) + new_levels <- ifelse(!in_smallest(count), levels, other_level) } else if (!missing(n)) { if (n < 0) { rank <- rank(count, ties = ties.method) diff --git a/man/fct_lump.Rd b/man/fct_lump.Rd index 70cd333e..7835e2ac 100644 --- a/man/fct_lump.Rd +++ b/man/fct_lump.Rd @@ -4,8 +4,8 @@ \alias{fct_lump} \title{Lump together least/most common factor levels into "other"} \usage{ -fct_lump(f, n, prop, other_level = "Other", ties.method = c("min", - "average", "first", "last", "random", "max")) +fct_lump(f, n, prop, weights = NULL, other_level = "Other", + ties.method = c("min", "average", "first", "last", "random", "max")) } \arguments{ \item{f}{A factor.} @@ -23,6 +23,9 @@ Positive \code{prop} preserves values that appear at least \code{prop} of the time. Negative \code{prop} preserves values that appear at most \code{-prop} of the time.} +\item{weights}{An optional numeric vector giving weights for frequency of +each value (not level) in f.} + \item{other_level}{Value of level used for "other" values. Always placed at end of levels.} @@ -51,6 +54,10 @@ fct_lump(x, prop = 0.1) fct_lump(x, n = -3) fct_lump(x, prop = -0.1) +# Use weighted frequencies +w <- c(rep(2, 50), rep(1, 50)) +fct_lump(x, n = 5, weights = w) + # Use ties.method to control how tied factors are collapsed fct_lump(x, n = 6) fct_lump(x, n = 6, ties.method = "max") diff --git a/tests/testthat/test-fct_lump.R b/tests/testthat/test-fct_lump.R index 576dcade..c234923a 100644 --- a/tests/testthat/test-fct_lump.R +++ b/tests/testthat/test-fct_lump.R @@ -55,6 +55,63 @@ test_that("different behaviour when apply tie function", { } }) +test_that("bad weights generate error messages", { + f <- c("a", "a", "a", "b", "b", "c", "d", "e", "f", "g") + + w <- c( + "one", + "three", + "four", + "five", + "nine", + "twenty", + "three", + "seven", + "twelve", + "two" + ) + expect_error(fct_lump(f, n = 2, weight = w), "must be a numeric vector") + + w <- c(1, 2, 3, 4, 5, 6, 7, 8, 9) + expect_error(fct_lump(f, n = 4, weight = w), "Different lengths") +}) + +test_that("values are correctly weighted", { + f <- c("a", "a", "a", "b", "b", "c", "d", "e", "f", "g") + w <- c( 0.2, 0.2, 0.6, 2, 2, 6, 4, 2, 2, 1) + f2 <- c( + "a", + rep("b", 4), + rep("c", 6), + rep("d", 4), + rep("e", 2), + rep("f", 2), + "g" + ) + + expect_equal(levels(fct_lump(f, weights = w)), levels(fct_lump(f2))) + expect_equal( + levels(fct_lump(f, n = 1, weights = w)), + levels(fct_lump(f2, n = 1)) + ) + expect_equal( + levels(fct_lump(f, n = -2, weights = w, ties.method = "first")), + levels(fct_lump(f2, n = -2, ties.method = "first")) + ) + expect_equal( + levels(fct_lump(f, n = 99, weights = w)), + levels(fct_lump(f2, n = 99)) + ) + expect_equal( + levels(fct_lump(f, prop = 0.01, weights = w)), + levels(fct_lump(f2, prop = 0.01)) + ) + expect_equal( + levels(fct_lump(f, prop = -0.25, weights = w, ties.method = "max")), + levels(fct_lump(f2, prop = -0.25, ties.method = "max")) + ) +}) + # Default ----------------------------------------------------------------- test_that("lumps smallest", {