From 4255b0f1b0dc478acac8b79e81fa44ea11e87e3f Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Tue, 21 Feb 2023 16:04:51 -0500 Subject: [PATCH 1/3] Implement `relationship` for `vec_locate_matches()` And silently soft-deprecate `multiple = "error"` and `multiple = "warning"` --- NAMESPACE | 6 + R/match.R | 214 ++++++++++++++-- man/vec_locate_matches.Rd | 57 ++++- src/decl/match-decl.h | 33 +++ src/init.c | 4 +- src/match.c | 429 +++++++++++++++++++++++++++++---- src/utils.c | 12 + src/utils.h | 6 + tests/testthat/_snaps/match.md | 260 +++++++++++++++++++- tests/testthat/test-match.R | 339 +++++++++++++++++++++++--- 10 files changed, 1240 insertions(+), 120 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 4d020d8de..f4350d243 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -95,6 +95,9 @@ S3method(cnd_body,vctrs_error_incompatible_size) S3method(cnd_body,vctrs_error_matches_incomplete) S3method(cnd_body,vctrs_error_matches_multiple) S3method(cnd_body,vctrs_error_matches_nothing) +S3method(cnd_body,vctrs_error_matches_relationship_many_to_one) +S3method(cnd_body,vctrs_error_matches_relationship_one_to_many) +S3method(cnd_body,vctrs_error_matches_relationship_one_to_one) S3method(cnd_body,vctrs_error_matches_remaining) S3method(cnd_body,vctrs_error_names_cannot_be_dot_dot) S3method(cnd_body,vctrs_error_names_cannot_be_empty) @@ -106,6 +109,9 @@ S3method(cnd_header,vctrs_error_incompatible_size) S3method(cnd_header,vctrs_error_matches_incomplete) S3method(cnd_header,vctrs_error_matches_multiple) S3method(cnd_header,vctrs_error_matches_nothing) +S3method(cnd_header,vctrs_error_matches_relationship_many_to_one) +S3method(cnd_header,vctrs_error_matches_relationship_one_to_many) +S3method(cnd_header,vctrs_error_matches_relationship_one_to_one) S3method(cnd_header,vctrs_error_matches_remaining) S3method(cnd_header,vctrs_error_names_cannot_be_dot_dot) S3method(cnd_header,vctrs_error_names_cannot_be_empty) diff --git a/R/match.R b/R/match.R index 19142bd42..a51a2ad57 100644 --- a/R/match.R +++ b/R/match.R @@ -108,9 +108,43 @@ #' `"last"` if you just need to detect if there is at least one match. #' - `"first"` returns the first match detected in `haystack`. #' - `"last"` returns the last match detected in `haystack`. -#' - `"warning"` throws a warning if multiple matches are detected, but -#' otherwise falls back to `"all"`. -#' - `"error"` throws an error if multiple matches are detected. +#' +#' @param relationship Handling of the expected relationship between +#' `needles` and `haystack`. If the expectations chosen from the list below +#' are invalidated, an error is thrown. +#' +#' - `"none"` doesn't perform any relationship checks. +#' +#' - `"one_to_one"` expects: +#' - Each value in `needles` matches at most 1 value in `haystack`. +#' - Each value in `haystack` matches at most 1 value in `needles`. +#' +#' - `"one_to_many"` expects: +#' - Each value in `needles` matches any number of values in `haystack`. +#' - Each value in `haystack` matches at most 1 value in `needles`. +#' +#' - `"many_to_one"` expects: +#' - Each value in `needles` matches at most 1 value in `haystack`. +#' - Each value in `haystack` matches any number of values in `needles`. +#' +#' - `"many_to_many"` expects: +#' - Each value in `needles` matches any number of values in `haystack`. +#' - Each value in `haystack` matches any number of values in `needles`. +#' +#' This performs no checks, and is identical to `"none"`, but is provided to +#' allow you to be explicit about this relationship if you know it exists. +#' +#' - `"warn_many_to_many"` doesn't assume there is any known relationship, but +#' will warn if `needles` and `haystack` have a many-to-many relationship +#' (which is typically unexpected), encouraging you to either take a closer +#' look at your inputs or make this relationship explicit by specifying +#' `"many_to_many"`. +#' +#' `relationship` is applied after `filter` and `multiple` to allow potential +#' multiple matches to be filtered out first. +#' +#' `relationship` doesn't handle cases where there are zero matches. For that, +#' see `no_match` and `remaining`. #' #' @param needles_arg,haystack_arg Argument tags for `needles` and `haystack` #' used in error messages. @@ -141,7 +175,13 @@ #' vec_locate_matches(x, y, multiple = "first") #' vec_locate_matches(x, y, multiple = "last") #' vec_locate_matches(x, y, multiple = "any") -#' try(vec_locate_matches(x, y, multiple = "error")) +#' +#' # Use `relationship` to add constraints and error on multiple matches if +#' # they aren't expected +#' try(vec_locate_matches(x, y, relationship = "one_to_one")) +#' +#' # In this case, the `NA` in `y` matches two rows in `x` +#' try(vec_locate_matches(x, y, relationship = "one_to_many")) #' #' # By default, NA is treated as being identical to NaN. #' # Using `nan_distinct = TRUE` treats NA and NaN as different values, so NA @@ -153,6 +193,10 @@ #' # in `needles`. #' vec_locate_matches(x, y, incomplete = NA) #' +#' # Using `incomplete = NA` allows us to enforce the one-to-many relationship +#' # that we couldn't before +#' vec_locate_matches(x, y, relationship = "one_to_many", incomplete = NA) +#' #' # `no_match` allows you to specify the returned value for a needle with #' # zero matches. Note that this is different from an incomplete value, #' # so specifying `no_match` allows you to differentiate between incomplete @@ -239,6 +283,7 @@ vec_locate_matches <- function(needles, no_match = NA_integer_, remaining = "drop", multiple = "all", + relationship = "none", nan_distinct = FALSE, chr_proxy_collate = NULL, needles_arg = "", @@ -257,6 +302,7 @@ vec_locate_matches <- function(needles, no_match, remaining, multiple, + relationship, nan_distinct, chr_proxy_collate, needles_arg, @@ -419,47 +465,173 @@ stop_matches_multiple <- function(i, needles_arg, haystack_arg, call) { cnd_header.vctrs_error_matches_multiple <- function(cnd, ...) { cnd_matches_multiple_header(cnd$needles_arg, cnd$haystack_arg) } -cnd_matches_multiple_header <- function(needles_arg, haystack_arg) { - if (nzchar(needles_arg)) { - needles_name <- glue::glue(" of `{needles_arg}` ") + +#' @export +cnd_body.vctrs_error_matches_multiple <- function(cnd, ...) { + cnd_matches_multiple_body(cnd$i) +} + +# ------------------------------------------------------------------------------ + +warn_matches_multiple <- function(i, needles_arg, haystack_arg, call) { + message <- paste( + cnd_matches_multiple_header(needles_arg, haystack_arg), + cnd_matches_multiple_body(i), + sep = "\n" + ) + + warn_matches( + message = message, + class = "vctrs_warning_matches_multiple", + i = i, + needles_arg = needles_arg, + haystack_arg = haystack_arg, + call = call + ) +} + +# ------------------------------------------------------------------------------ + +stop_matches_relationship_one_to_one <- function(i, which, needles_arg, haystack_arg, call) { + stop_matches_relationship( + class = "vctrs_error_matches_relationship_one_to_one", + i = i, + which = which, + needles_arg = needles_arg, + haystack_arg = haystack_arg, + call = call + ) +} + +#' @export +cnd_header.vctrs_error_matches_relationship_one_to_one <- function(cnd, ...) { + if (cnd$which == "needles") { + cnd_matches_multiple_header(cnd$needles_arg, cnd$haystack_arg) } else { - needles_name <- " " + cnd_matches_multiple_header(cnd$haystack_arg, cnd$needles_arg) } +} - if (nzchar(haystack_arg)) { - haystack_name <- glue::glue(" from `{haystack_arg}`") +#' @export +cnd_body.vctrs_error_matches_relationship_one_to_one <- function(cnd, ...) { + if (cnd$which == "needles") { + cnd_matches_multiple_body(cnd$i, cnd$needles_arg) } else { - haystack_name <- "" + cnd_matches_multiple_body(cnd$i, cnd$haystack_arg) } +} + - glue::glue("Each element{needles_name}can match at most 1 observation{haystack_name}.") +stop_matches_relationship_one_to_many <- function(i, needles_arg, haystack_arg, call) { + stop_matches_relationship( + class = "vctrs_error_matches_relationship_one_to_many", + i = i, + needles_arg = needles_arg, + haystack_arg = haystack_arg, + call = call + ) } #' @export -cnd_body.vctrs_error_matches_multiple <- function(cnd, ...) { - cnd_matches_multiple_body(cnd$i) +cnd_header.vctrs_error_matches_relationship_one_to_many <- function(cnd, ...) { + cnd_matches_multiple_header(cnd$haystack_arg, cnd$needles_arg) +} + +#' @export +cnd_body.vctrs_error_matches_relationship_one_to_many <- function(cnd, ...) { + cnd_matches_multiple_body(cnd$i, cnd$haystack_arg) +} + + +stop_matches_relationship_many_to_one <- function(i, needles_arg, haystack_arg, call) { + stop_matches_relationship( + class = "vctrs_error_matches_relationship_many_to_one", + i = i, + needles_arg = needles_arg, + haystack_arg = haystack_arg, + call = call + ) +} + +#' @export +cnd_header.vctrs_error_matches_relationship_many_to_one <- function(cnd, ...) { + cnd_matches_multiple_header(cnd$needles_arg, cnd$haystack_arg) +} + +#' @export +cnd_body.vctrs_error_matches_relationship_many_to_one <- function(cnd, ...) { + cnd_matches_multiple_body(cnd$i, cnd$needles_arg) +} + + +stop_matches_relationship <- function(class = NULL, ..., call = caller_env()) { + stop_matches( + class = c(class, "vctrs_error_matches_relationship"), + ..., + call = call + ) +} + +cnd_matches_multiple_header <- function(x_arg, y_arg) { + if (nzchar(x_arg)) { + x_name <- glue::glue(" of `{x_arg}` ") + } else { + x_name <- " " + } + + if (nzchar(y_arg)) { + y_name <- glue::glue(" from `{y_arg}`") + } else { + y_name <- "" + } + + glue::glue("Each element{x_name}can match at most 1 observation{y_name}.") } -cnd_matches_multiple_body <- function(i) { - bullet <- glue::glue("The element at location {i} has multiple matches.") + +cnd_matches_multiple_body <- function(i, name = "") { + if (nzchar(name)) { + bullet <- glue::glue("The element of `{name}` at location {i} has multiple matches.") + } else { + bullet <- glue::glue("The element at location {i} has multiple matches.") + } bullet <- c(x = bullet) format_error_bullets(bullet) } # ------------------------------------------------------------------------------ -warn_matches_multiple <- function(i, needles_arg, haystack_arg, call) { +warn_matches_relationship_many_to_many <- function(i, j, needles_arg, haystack_arg, call) { + if (nzchar(needles_arg) && nzchar(haystack_arg)) { + name_needles_and_haystack <- glue::glue(" between `{needles_arg}` and `{haystack_arg}`") + } else { + name_needles_and_haystack <- "" + } + + header <- glue::glue("Detected an unexpected many-to-many relationship{name_needles_and_haystack}.") + message <- paste( - cnd_matches_multiple_header(needles_arg, haystack_arg), - cnd_matches_multiple_body(i), + header, + cnd_matches_multiple_body(i, needles_arg), + cnd_matches_multiple_body(j, haystack_arg), sep = "\n" ) - warn_matches( + warn_matches_relationship( message = message, - class = "vctrs_warning_matches_multiple", + class = "vctrs_warning_matches_relationship_many_to_many", i = i, + j = j, needles_arg = needles_arg, haystack_arg = haystack_arg, call = call ) } + +warn_matches_relationship <- function(message, class = NULL, ..., call = caller_env()) { + warn_matches( + message = message, + class = c(class, "vctrs_warning_matches_relationship"), + ..., + call = call + ) +} diff --git a/man/vec_locate_matches.Rd b/man/vec_locate_matches.Rd index 95108cf00..1a0d8d19b 100644 --- a/man/vec_locate_matches.Rd +++ b/man/vec_locate_matches.Rd @@ -14,6 +14,7 @@ vec_locate_matches( no_match = NA_integer_, remaining = "drop", multiple = "all", + relationship = "none", nan_distinct = FALSE, chr_proxy_collate = NULL, needles_arg = "", @@ -108,11 +109,49 @@ which match will be returned. It is often faster than \code{"first"} and \code{"last"} if you just need to detect if there is at least one match. \item \code{"first"} returns the first match detected in \code{haystack}. \item \code{"last"} returns the last match detected in \code{haystack}. -\item \code{"warning"} throws a warning if multiple matches are detected, but -otherwise falls back to \code{"all"}. -\item \code{"error"} throws an error if multiple matches are detected. }} +\item{relationship}{Handling of the expected relationship between +\code{needles} and \code{haystack}. If the expectations chosen from the list below +are invalidated, an error is thrown. +\itemize{ +\item \code{"none"} doesn't perform any relationship checks. +\item \code{"one_to_one"} expects: +\itemize{ +\item Each value in \code{needles} matches at most 1 value in \code{haystack}. +\item Each value in \code{haystack} matches at most 1 value in \code{needles}. +} +\item \code{"one_to_many"} expects: +\itemize{ +\item Each value in \code{needles} matches any number of values in \code{haystack}. +\item Each value in \code{haystack} matches at most 1 value in \code{needles}. +} +\item \code{"many_to_one"} expects: +\itemize{ +\item Each value in \code{needles} matches at most 1 value in \code{haystack}. +\item Each value in \code{haystack} matches any number of values in \code{needles}. +} +\item \code{"many_to_many"} expects: +\itemize{ +\item Each value in \code{needles} matches any number of values in \code{haystack}. +\item Each value in \code{haystack} matches any number of values in \code{needles}. +} + +This performs no checks, and is identical to \code{"none"}, but is provided to +allow you to be explicit about this relationship if you know it exists. +\item \code{"warn_many_to_many"} doesn't assume there is any known relationship, but +will warn if \code{needles} and \code{haystack} have a many-to-many relationship +(which is typically unexpected), encouraging you to either take a closer +look at your inputs or make this relationship explicit by specifying +\code{"many_to_many"}. +} + +\code{relationship} is applied after \code{filter} and \code{multiple} to allow potential +multiple matches to be filtered out first. + +\code{relationship} doesn't handle cases where there are zero matches. For that, +see \code{no_match} and \code{remaining}.} + \item{nan_distinct}{A single logical specifying whether or not \code{NaN} should be considered distinct from \code{NA} for double and complex vectors. If \code{TRUE}, \code{NaN} will always be ordered between \code{NA} and non-missing numbers.} @@ -207,7 +246,13 @@ data_frame( vec_locate_matches(x, y, multiple = "first") vec_locate_matches(x, y, multiple = "last") vec_locate_matches(x, y, multiple = "any") -try(vec_locate_matches(x, y, multiple = "error")) + +# Use `relationship` to add constraints and error on multiple matches if +# they aren't expected +try(vec_locate_matches(x, y, relationship = "one_to_one")) + +# In this case, the `NA` in `y` matches two rows in `x` +try(vec_locate_matches(x, y, relationship = "one_to_many")) # By default, NA is treated as being identical to NaN. # Using `nan_distinct = TRUE` treats NA and NaN as different values, so NA @@ -219,6 +264,10 @@ vec_locate_matches(x, y, nan_distinct = TRUE) # in `needles`. vec_locate_matches(x, y, incomplete = NA) +# Using `incomplete = NA` allows us to enforce the one-to-many relationship +# that we couldn't before +vec_locate_matches(x, y, relationship = "one_to_many", incomplete = NA) + # `no_match` allows you to specify the returned value for a needle with # zero matches. Note that this is different from an incomplete value, # so specifying `no_match` allows you to differentiate between incomplete diff --git a/src/decl/match-decl.h b/src/decl/match-decl.h index d7b4b03e3..e2e12868e 100644 --- a/src/decl/match-decl.h +++ b/src/decl/match-decl.h @@ -18,6 +18,7 @@ r_obj* vec_locate_matches(r_obj* needles, const struct vctrs_no_match* no_match, const struct vctrs_remaining* remaining, enum vctrs_multiple multiple, + enum vctrs_relationship relationship, bool nan_distinct, r_obj* chr_proxy_collate, struct vctrs_arg* needles_arg, @@ -35,6 +36,7 @@ r_obj* df_locate_matches(r_obj* needles, const struct vctrs_no_match* no_match, const struct vctrs_remaining* remaining, enum vctrs_multiple multiple, + enum vctrs_relationship relationship, bool any_filters, const enum vctrs_filter* v_filters, const enum vctrs_ops* v_ops, @@ -144,6 +146,10 @@ static inline enum vctrs_multiple parse_multiple(r_obj* multiple, struct r_lazy call); +static inline +enum vctrs_relationship parse_relationship(r_obj* relationship, + struct r_lazy call); + static inline void parse_filter(r_obj* filter, r_ssize n_cols, enum vctrs_filter* v_filters); @@ -158,6 +164,7 @@ r_obj* expand_compact_indices(const int* v_o_haystack, const struct vctrs_no_match* no_match, const struct vctrs_remaining* remaining, enum vctrs_multiple multiple, + enum vctrs_relationship relationship, r_ssize size_needles, r_ssize size_haystack, bool any_non_equi, @@ -237,3 +244,29 @@ void warn_matches_multiple(r_ssize i, struct vctrs_arg* needles_arg, struct vctrs_arg* haystack_arg, struct r_lazy call); + +static inline +void stop_matches_relationship_one_to_one(r_ssize i, + const char* which, + struct vctrs_arg* needles_arg, + struct vctrs_arg* haystack_arg, + struct r_lazy call); + +static inline +void stop_matches_relationship_one_to_many(r_ssize i, + struct vctrs_arg* needles_arg, + struct vctrs_arg* haystack_arg, + struct r_lazy call); + +static inline +void stop_matches_relationship_many_to_one(r_ssize i, + struct vctrs_arg* needles_arg, + struct vctrs_arg* haystack_arg, + struct r_lazy call); + +static inline +void warn_matches_relationship_many_to_many(r_ssize i, + r_ssize j, + struct vctrs_arg* needles_arg, + struct vctrs_arg* haystack_arg, + struct r_lazy call); diff --git a/src/init.c b/src/init.c index 6923f608e..75cb9505f 100644 --- a/src/init.c +++ b/src/init.c @@ -143,7 +143,7 @@ extern r_obj* vctrs_list_drop_empty(r_obj*); extern r_obj* vctrs_is_altrep(r_obj* x); extern r_obj* ffi_interleave_indices(r_obj*, r_obj*); extern r_obj* ffi_compute_nesting_container_info(r_obj*, r_obj*); -extern r_obj* ffi_locate_matches(r_obj*, r_obj*, r_obj*, r_obj*, r_obj*, r_obj*, r_obj*, r_obj*, r_obj*, r_obj*, r_obj*, r_obj*, r_obj*); +extern r_obj* ffi_locate_matches(r_obj*, r_obj*, r_obj*, r_obj*, r_obj*, r_obj*, r_obj*, r_obj*, r_obj*, r_obj*, r_obj*, r_obj*, r_obj*, r_obj*); extern r_obj* ffi_interval_groups(r_obj*, r_obj*, r_obj*, r_obj*); extern r_obj* ffi_interval_locate_groups(r_obj*, r_obj*, r_obj*, r_obj*); extern r_obj* ffi_interval_complement(r_obj*, r_obj*, r_obj*, r_obj*); @@ -326,7 +326,7 @@ static const R_CallMethodDef CallEntries[] = { {"vctrs_is_altrep", (DL_FUNC) &vctrs_is_altrep, 1}, {"ffi_interleave_indices", (DL_FUNC) &ffi_interleave_indices, 2}, {"ffi_compute_nesting_container_info", (DL_FUNC) &ffi_compute_nesting_container_info, 2}, - {"ffi_locate_matches", (DL_FUNC) &ffi_locate_matches, 13}, + {"ffi_locate_matches", (DL_FUNC) &ffi_locate_matches, 14}, {"ffi_interval_groups", (DL_FUNC) &ffi_interval_groups, 4}, {"ffi_interval_locate_groups", (DL_FUNC) &ffi_interval_locate_groups, 4}, {"ffi_interval_complement", (DL_FUNC) &ffi_interval_complement, 4}, diff --git a/src/match.c b/src/match.c index dabea1b62..c88e29030 100644 --- a/src/match.c +++ b/src/match.c @@ -4,11 +4,22 @@ enum vctrs_multiple { VCTRS_MULTIPLE_all = 0, - VCTRS_MULTIPLE_warning = 1, - VCTRS_MULTIPLE_error = 2, - VCTRS_MULTIPLE_first = 3, - VCTRS_MULTIPLE_last = 4, - VCTRS_MULTIPLE_any = 5 + VCTRS_MULTIPLE_any = 1, + VCTRS_MULTIPLE_first = 2, + VCTRS_MULTIPLE_last = 3, + + // Deprecated in favor of `relationship` + VCTRS_MULTIPLE_warning = 4, + VCTRS_MULTIPLE_error = 5 +}; + +enum vctrs_relationship { + VCTRS_RELATIONSHIP_none = 0, + VCTRS_RELATIONSHIP_one_to_one = 1, + VCTRS_RELATIONSHIP_one_to_many = 2, + VCTRS_RELATIONSHIP_many_to_one = 3, + VCTRS_RELATIONSHIP_many_to_many = 4, + VCTRS_RELATIONSHIP_warn_many_to_many = 5 }; enum vctrs_filter { @@ -80,6 +91,7 @@ r_obj* ffi_locate_matches(r_obj* needles, r_obj* no_match, r_obj* remaining, r_obj* multiple, + r_obj* relationship, r_obj* nan_distinct, r_obj* chr_proxy_collate, r_obj* needles_arg, @@ -92,6 +104,7 @@ r_obj* ffi_locate_matches(r_obj* needles, const struct vctrs_no_match c_no_match = parse_no_match(no_match, internal_call); const struct vctrs_remaining c_remaining = parse_remaining(remaining, internal_call); const enum vctrs_multiple c_multiple = parse_multiple(multiple, internal_call); + const enum vctrs_relationship c_relationship = parse_relationship(relationship, internal_call); const bool c_nan_distinct = r_arg_as_bool(nan_distinct, "nan_distinct"); struct vctrs_arg c_needles_arg = vec_as_arg(needles_arg); @@ -106,6 +119,7 @@ r_obj* ffi_locate_matches(r_obj* needles, &c_no_match, &c_remaining, c_multiple, + c_relationship, c_nan_distinct, chr_proxy_collate, &c_needles_arg, @@ -123,6 +137,7 @@ r_obj* vec_locate_matches(r_obj* needles, const struct vctrs_no_match* no_match, const struct vctrs_remaining* remaining, enum vctrs_multiple multiple, + enum vctrs_relationship relationship, bool nan_distinct, r_obj* chr_proxy_collate, struct vctrs_arg* needles_arg, @@ -228,6 +243,7 @@ r_obj* vec_locate_matches(r_obj* needles, no_match, remaining, multiple, + relationship, any_filters, v_filters, v_ops, @@ -253,6 +269,7 @@ r_obj* df_locate_matches(r_obj* needles, const struct vctrs_no_match* no_match, const struct vctrs_remaining* remaining, enum vctrs_multiple multiple, + enum vctrs_relationship relationship, bool any_filters, const enum vctrs_filter* v_filters, const enum vctrs_ops* v_ops, @@ -448,6 +465,7 @@ r_obj* df_locate_matches(r_obj* needles, no_match, remaining, multiple, + relationship, size_needles, size_haystack, any_non_equi, @@ -1365,12 +1383,36 @@ enum vctrs_multiple parse_multiple(r_obj* multiple, struct r_lazy call) { if (!strcmp(c_multiple, "any")) return VCTRS_MULTIPLE_any; if (!strcmp(c_multiple, "first")) return VCTRS_MULTIPLE_first; if (!strcmp(c_multiple, "last")) return VCTRS_MULTIPLE_last; + // TODO: Remove deprecated support for `multiple = "error"/"warning"` if (!strcmp(c_multiple, "warning")) return VCTRS_MULTIPLE_warning; if (!strcmp(c_multiple, "error")) return VCTRS_MULTIPLE_error; r_abort_lazy_call( call, - "`multiple` must be one of \"all\", \"any\", \"first\", \"last\", \"warning\", or \"error\"." + "`multiple` must be one of \"all\", \"any\", \"first\", or \"last\"." + ); +} + +// ----------------------------------------------------------------------------- + +static inline +enum vctrs_relationship parse_relationship(r_obj* relationship, struct r_lazy call) { + if (!r_is_string(relationship)) { + r_abort_lazy_call(call, "`relationship` must be a string."); + } + + const char* c_relationship = r_chr_get_c_string(relationship, 0); + + if (!strcmp(c_relationship, "none")) return VCTRS_RELATIONSHIP_none; + if (!strcmp(c_relationship, "one_to_one")) return VCTRS_RELATIONSHIP_one_to_one; + if (!strcmp(c_relationship, "one_to_many")) return VCTRS_RELATIONSHIP_one_to_many; + if (!strcmp(c_relationship, "many_to_one")) return VCTRS_RELATIONSHIP_many_to_one; + if (!strcmp(c_relationship, "many_to_many")) return VCTRS_RELATIONSHIP_many_to_many; + if (!strcmp(c_relationship, "warn_many_to_many")) return VCTRS_RELATIONSHIP_warn_many_to_many; + + r_abort_lazy_call( + call, + "`relationship` must be one of \"none\", \"one_to_one\", \"one_to_many\", \"many_to_one\", \"many_to_many\", or \"warn_many_to_many\"." ); } @@ -1556,6 +1598,7 @@ r_obj* expand_compact_indices(const int* v_o_haystack, const struct vctrs_no_match* no_match, const struct vctrs_remaining* remaining, enum vctrs_multiple multiple, + enum vctrs_relationship relationship, r_ssize size_needles, r_ssize size_haystack, bool any_non_equi, @@ -1626,25 +1669,59 @@ r_obj* expand_compact_indices(const int* v_o_haystack, v_o_loc_needles = r_int_cbegin(o_loc_needles); } - const bool retain_remaining_haystack = (remaining->action != VCTRS_REMAINING_ACTION_drop); - int* v_detect_remaining_haystack = NULL; - if (retain_remaining_haystack) { - r_obj* detect_remaining_haystack = KEEP_N(r_alloc_integer(size_haystack), &n_prot); - v_detect_remaining_haystack = r_int_begin(detect_remaining_haystack); + bool any_multiple_needles = false; + bool any_multiple_haystack = false; - for (r_ssize i = 0; i < size_haystack; ++i) { - // Initialize to "remaining" (i.e. this haystack value wasn't matched) - v_detect_remaining_haystack[i] = 1; - } - } + // For `relationship = "warn_many_to_many"` + r_ssize loc_first_multiple_needles = -1; + r_ssize loc_first_multiple_haystack = -1; - bool any_multiple = false; - bool check_for_multiple = + // Check is always needed for `multiple = "all"`. + // This also handles `relationship` options too, since if `multiple` is + // `"any"`, `"first"`, or `"last"`, we can't invalidate a `relationship`. + bool check_multiple_needles = multiple == VCTRS_MULTIPLE_all || + // TODO: Remove deprecated support for `multiple = "error"/"warning"` multiple == VCTRS_MULTIPLE_error || multiple == VCTRS_MULTIPLE_warning; - // For multiple = "first" / "last" + bool check_multiple_haystack = false; + switch (relationship) { + // Expecting `haystack` can match any number of `needles` + case VCTRS_RELATIONSHIP_none: + case VCTRS_RELATIONSHIP_many_to_one: + case VCTRS_RELATIONSHIP_many_to_many: { + check_multiple_haystack = false; + break; + } + // Expecting `haystack` to match at most 1 `needles` + case VCTRS_RELATIONSHIP_one_to_one: + case VCTRS_RELATIONSHIP_one_to_many: { + check_multiple_haystack = true; + break; + } + // Only check for multiple matches in `haystack` if we are also checking + // for them in `needles`. Otherwise we can't possibly have a many-to-many + // issue so there is no need to check for one. + case VCTRS_RELATIONSHIP_warn_many_to_many: { + check_multiple_haystack = check_multiple_needles; + break; + } + } + + const bool retain_remaining_haystack = + remaining->action == VCTRS_REMAINING_ACTION_value || + remaining->action == VCTRS_REMAINING_ACTION_error; + + bool track_matches_haystack = check_multiple_haystack || retain_remaining_haystack; + bool* v_detect_matches_haystack = NULL; + if (track_matches_haystack) { + r_obj* detect_matches_haystack = KEEP_N(r_alloc_raw(size_haystack * sizeof(bool)), &n_prot); + v_detect_matches_haystack = r_raw_begin(detect_matches_haystack); + memset(v_detect_matches_haystack, 0, size_haystack * sizeof(bool)); + } + + // For `multiple = "first" / "last"` r_ssize loc_haystack_overall = r_globals.na_int; r_ssize loc_out = 0; @@ -1747,23 +1824,74 @@ r_obj* expand_compact_indices(const int* v_o_haystack, } } - if (check_for_multiple) { + if (check_multiple_needles) { if (loc < size_needles) { - any_multiple = size_match > 1; + any_multiple_needles = size_match > 1; } else { // Guaranteed second match if in the "extra" matches section - any_multiple = true; + any_multiple_needles = true; } - if (any_multiple) { - if (multiple == VCTRS_MULTIPLE_error) { - stop_matches_multiple(loc_needles, needles_arg, haystack_arg, error_call); - } else if (multiple == VCTRS_MULTIPLE_warning) { - warn_matches_multiple(loc_needles, needles_arg, haystack_arg, error_call); + if (any_multiple_needles) { + loc_first_multiple_needles = loc_needles; + + switch (relationship) { + case VCTRS_RELATIONSHIP_one_to_one: + stop_matches_relationship_one_to_one( + loc_first_multiple_needles, + "needles", + needles_arg, + haystack_arg, + error_call + ); + case VCTRS_RELATIONSHIP_many_to_one: + stop_matches_relationship_many_to_one( + loc_first_multiple_needles, + needles_arg, + haystack_arg, + error_call + ); + case VCTRS_RELATIONSHIP_warn_many_to_many: { + if (any_multiple_haystack) { + warn_matches_relationship_many_to_many( + loc_first_multiple_needles, + loc_first_multiple_haystack, + needles_arg, + haystack_arg, + error_call + ); + } + break; + } + default: { + switch (multiple) { + case VCTRS_MULTIPLE_all: + break; + // TODO: Remove deprecated support for `multiple = "error"/"warning"` + case VCTRS_MULTIPLE_error: + stop_matches_multiple( + loc_first_multiple_needles, + needles_arg, + haystack_arg, + error_call + ); + case VCTRS_MULTIPLE_warning: { + warn_matches_multiple( + loc_first_multiple_needles, + needles_arg, + haystack_arg, + error_call + ); + break; + } + default: + r_stop_internal("`check_multiple_needles` should have been false."); + } + } } // We know there are multiple and don't need to continue checking - check_for_multiple = false; + check_multiple_needles = false; } } @@ -1829,9 +1957,44 @@ r_obj* expand_compact_indices(const int* v_o_haystack, v_out_needles[loc_out] = loc_needles + 1; v_out_haystack[loc_out] = loc_haystack_overall + 1; - if (retain_remaining_haystack) { - // This haystack value was a match, so it isn't "remaining" - v_detect_remaining_haystack[loc_haystack_overall] = 0; + if (track_matches_haystack) { + if (check_multiple_haystack) { + // `true` if a match already existed + any_multiple_haystack = v_detect_matches_haystack[loc_haystack_overall]; + + if (any_multiple_haystack) { + loc_first_multiple_haystack = loc_haystack_overall; + + switch (relationship) { + case VCTRS_RELATIONSHIP_one_to_one: + stop_matches_relationship_one_to_one( + loc_first_multiple_haystack, + "haystack", + needles_arg, + haystack_arg, + error_call + ); + case VCTRS_RELATIONSHIP_one_to_many: + stop_matches_relationship_one_to_many( + loc_first_multiple_haystack, + needles_arg, + haystack_arg, + error_call + ); + case VCTRS_RELATIONSHIP_warn_many_to_many: + r_stop_internal( + "`relationship = 'warn_many_to_many'` with " + "`multiple = 'first'/'last' should have resulted in " + "`check_multiple_haystack = false`." + ); + default: + r_stop_internal("`check_multiple_haystack` should have been false."); + } + } + } + + // This haystack value was a match, so it isn't "remaining". + v_detect_matches_haystack[loc_haystack_overall] = true; } ++loc_out; @@ -1850,9 +2013,57 @@ r_obj* expand_compact_indices(const int* v_o_haystack, v_out_needles[loc_out] = loc_needles + 1; v_out_haystack[loc_out] = loc_haystack + 1; - if (retain_remaining_haystack) { - // This haystack value was a match, so it isn't "remaining" - v_detect_remaining_haystack[loc_haystack] = 0; + if (track_matches_haystack) { + if (check_multiple_haystack) { + // `true` if a match already existed + any_multiple_haystack = v_detect_matches_haystack[loc_haystack]; + + if (any_multiple_haystack) { + loc_first_multiple_haystack = loc_haystack; + + switch (relationship) { + case VCTRS_RELATIONSHIP_one_to_one: + stop_matches_relationship_one_to_one( + loc_first_multiple_haystack, + "haystack", + needles_arg, + haystack_arg, + error_call + ); + case VCTRS_RELATIONSHIP_one_to_many: + stop_matches_relationship_one_to_many( + loc_first_multiple_haystack, + needles_arg, + haystack_arg, + error_call + ); + case VCTRS_RELATIONSHIP_warn_many_to_many: { + if (any_multiple_needles) { + warn_matches_relationship_many_to_many( + loc_first_multiple_needles, + loc_first_multiple_haystack, + needles_arg, + haystack_arg, + error_call + ); + } + + // We know there are multiple and don't need to continue checking + check_multiple_haystack = false; + + // Only continue tracking if needed for `remaining` + track_matches_haystack = retain_remaining_haystack; + + break; + } + default: + r_stop_internal("`check_multiple_haystack` should have been false."); + } + } + } + + // This haystack value was a match, so it isn't "remaining". + v_detect_matches_haystack[loc_haystack] = true; } ++loc_out; @@ -1881,7 +2092,7 @@ r_obj* expand_compact_indices(const int* v_o_haystack, v_out_haystack = r_int_begin(out_haystack); } - if (any_multiple && any_non_equi) { + if (any_multiple_needles && any_non_equi) { // If we had multiple matches and we were doing a non-equi join, then // the needles column will be correct, but any group of multiple matches in // the haystack column will be ordered incorrectly within the needle group. @@ -1914,19 +2125,24 @@ r_obj* expand_compact_indices(const int* v_o_haystack, if (retain_remaining_haystack) { r_ssize n_remaining_haystack = 0; - for (r_ssize i = 0; i < size_haystack; ++i) { - if (!v_detect_remaining_haystack[i]) { - continue; + switch (remaining->action) { + case VCTRS_REMAINING_ACTION_error: { + for (r_ssize i = 0; i < size_haystack; ++i) { + if (!v_detect_matches_haystack[i]) { + stop_matches_remaining(i, needles_arg, haystack_arg, error_call); + } } - - if (remaining->action == VCTRS_REMAINING_ACTION_error) { - stop_matches_remaining(i, needles_arg, haystack_arg, error_call); + break; + } + case VCTRS_REMAINING_ACTION_value: { + for (r_ssize i = 0; i < size_haystack; ++i) { + n_remaining_haystack += !v_detect_matches_haystack[i]; } - - // Overwrite with location, this moves all "remaining" locations to the - // front so we can loop over them sequentially - v_detect_remaining_haystack[n_remaining_haystack] = i; - ++n_remaining_haystack; + break; + } + case VCTRS_REMAINING_ACTION_drop: { + r_stop_internal("`remaining` should never be 'drop' here."); + } } if (n_remaining_haystack > 0) { @@ -1942,9 +2158,17 @@ r_obj* expand_compact_indices(const int* v_o_haystack, v_out_haystack = r_int_begin(out_haystack); // Add in "remaining" values at the end of the output - for (r_ssize i = size_out, j = 0; i < new_size_out; ++i, ++j) { + for (r_ssize i = size_out; i < new_size_out; ++i) { v_out_needles[i] = remaining->value; - v_out_haystack[i] = v_detect_remaining_haystack[j] + 1; + } + + r_ssize j = size_out; + + for (r_ssize i = 0; i < size_haystack; ++i) { + if (!v_detect_matches_haystack[i]) { + v_out_haystack[j] = i + 1; + ++j; + } } size_out = new_size_out; @@ -2537,6 +2761,115 @@ void warn_matches_multiple(r_ssize i, FREE(5); } +static inline +void stop_matches_relationship_one_to_one(r_ssize i, + const char* which, + struct vctrs_arg* needles_arg, + struct vctrs_arg* haystack_arg, + struct r_lazy call) { + r_obj* syms[6] = { + syms_i, + syms_which, + syms_needles_arg, + syms_haystack_arg, + syms_call, + NULL + }; + r_obj* args[6] = { + KEEP(r_int((int)i + 1)), + KEEP(r_chr(which)), + KEEP(vctrs_arg(needles_arg)), + KEEP(vctrs_arg(haystack_arg)), + KEEP(r_lazy_eval_protect(call)), + NULL + }; + + r_obj* ffi_call = KEEP(r_call_n(syms_stop_matches_relationship_one_to_one, syms, args)); + Rf_eval(ffi_call, vctrs_ns_env); + + never_reached("stop_matches_relationship_one_to_one"); +} + +static inline +void stop_matches_relationship_one_to_many(r_ssize i, + struct vctrs_arg* needles_arg, + struct vctrs_arg* haystack_arg, + struct r_lazy call) { + r_obj* syms[5] = { + syms_i, + syms_needles_arg, + syms_haystack_arg, + syms_call, + NULL + }; + r_obj* args[5] = { + KEEP(r_int((int)i + 1)), + KEEP(vctrs_arg(needles_arg)), + KEEP(vctrs_arg(haystack_arg)), + KEEP(r_lazy_eval_protect(call)), + NULL + }; + + r_obj* ffi_call = KEEP(r_call_n(syms_stop_matches_relationship_one_to_many, syms, args)); + Rf_eval(ffi_call, vctrs_ns_env); + + never_reached("stop_matches_relationship_one_to_many"); +} + +static inline +void stop_matches_relationship_many_to_one(r_ssize i, + struct vctrs_arg* needles_arg, + struct vctrs_arg* haystack_arg, + struct r_lazy call) { + r_obj* syms[5] = { + syms_i, + syms_needles_arg, + syms_haystack_arg, + syms_call, + NULL + }; + r_obj* args[5] = { + KEEP(r_int((int)i + 1)), + KEEP(vctrs_arg(needles_arg)), + KEEP(vctrs_arg(haystack_arg)), + KEEP(r_lazy_eval_protect(call)), + NULL + }; + + r_obj* ffi_call = KEEP(r_call_n(syms_stop_matches_relationship_many_to_one, syms, args)); + Rf_eval(ffi_call, vctrs_ns_env); + + never_reached("stop_matches_relationship_many_to_one"); +} + +static inline +void warn_matches_relationship_many_to_many(r_ssize i, + r_ssize j, + struct vctrs_arg* needles_arg, + struct vctrs_arg* haystack_arg, + struct r_lazy call) { + r_obj* syms[6] = { + syms_i, + syms_j, + syms_needles_arg, + syms_haystack_arg, + syms_call, + NULL + }; + r_obj* args[6] = { + KEEP(r_int((int)i + 1)), + KEEP(r_int((int)j + 1)), + KEEP(vctrs_arg(needles_arg)), + KEEP(vctrs_arg(haystack_arg)), + KEEP(r_lazy_eval_protect(call)), + NULL + }; + + r_obj* ffi_call = KEEP(r_call_n(syms_warn_matches_relationship_many_to_many, syms, args)); + Rf_eval(ffi_call, vctrs_ns_env); + FREE(6); +} + // ----------------------------------------------------------------------------- void vctrs_init_match(r_obj* ns) { diff --git a/src/utils.c b/src/utils.c index 97ec8dcb1..a4cd4033c 100644 --- a/src/utils.c +++ b/src/utils.c @@ -1554,6 +1554,7 @@ SEXP chrs_smallest = NULL; SEXP chrs_which = NULL; SEXP syms_i = NULL; +SEXP syms_j = NULL; SEXP syms_n = NULL; SEXP syms_x = NULL; SEXP syms_y = NULL; @@ -1605,6 +1606,10 @@ SEXP syms_stop_matches_remaining = NULL; SEXP syms_stop_matches_incomplete = NULL; SEXP syms_stop_matches_multiple = NULL; SEXP syms_warn_matches_multiple = NULL; +SEXP syms_stop_matches_relationship_one_to_one = NULL; +SEXP syms_stop_matches_relationship_one_to_many = NULL; +SEXP syms_stop_matches_relationship_many_to_one = NULL; +SEXP syms_warn_matches_relationship_many_to_many = NULL; SEXP syms_action = NULL; SEXP syms_vctrs_common_class_fallback = NULL; SEXP syms_fallback_class = NULL; @@ -1615,6 +1620,7 @@ SEXP syms_actual = NULL; SEXP syms_required = NULL; SEXP syms_call = NULL; SEXP syms_dot_call = NULL; +SEXP syms_which = NULL; SEXP fns_bracket = NULL; SEXP fns_quote = NULL; @@ -1828,6 +1834,7 @@ void vctrs_init_utils(SEXP ns) { INTEGER(vctrs_shared_zero_int)[0] = 0; syms_i = Rf_install("i"); + syms_j = Rf_install("j"); syms_n = Rf_install("n"); syms_x = Rf_install("x"); syms_y = Rf_install("y"); @@ -1880,6 +1887,10 @@ void vctrs_init_utils(SEXP ns) { syms_stop_matches_incomplete = Rf_install("stop_matches_incomplete"); syms_stop_matches_multiple = Rf_install("stop_matches_multiple"); syms_warn_matches_multiple = Rf_install("warn_matches_multiple"); + syms_stop_matches_relationship_one_to_one = Rf_install("stop_matches_relationship_one_to_one"); + syms_stop_matches_relationship_one_to_many = Rf_install("stop_matches_relationship_one_to_many"); + syms_stop_matches_relationship_many_to_one = Rf_install("stop_matches_relationship_many_to_one"); + syms_warn_matches_relationship_many_to_many = Rf_install("warn_matches_relationship_many_to_many"); syms_action = Rf_install("action"); syms_vctrs_common_class_fallback = Rf_install(c_strs_vctrs_common_class_fallback); syms_fallback_class = Rf_install("fallback_class"); @@ -1890,6 +1901,7 @@ void vctrs_init_utils(SEXP ns) { syms_required = Rf_install("required"); syms_call = Rf_install("call"); syms_dot_call = Rf_install(".call"); + syms_which = Rf_install("which"); fns_bracket = Rf_findVar(syms_bracket, R_BaseEnv); fns_quote = Rf_findVar(Rf_install("quote"), R_BaseEnv); diff --git a/src/utils.h b/src/utils.h index f195d328a..3cdb1dc6b 100644 --- a/src/utils.h +++ b/src/utils.h @@ -448,6 +448,7 @@ extern SEXP chrs_smallest; extern SEXP chrs_which; extern SEXP syms_i; +extern SEXP syms_j; extern SEXP syms_n; extern SEXP syms_x; extern SEXP syms_y; @@ -497,6 +498,10 @@ extern SEXP syms_stop_matches_remaining; extern SEXP syms_stop_matches_incomplete; extern SEXP syms_stop_matches_multiple; extern SEXP syms_warn_matches_multiple; +extern SEXP syms_stop_matches_relationship_one_to_one; +extern SEXP syms_stop_matches_relationship_one_to_many; +extern SEXP syms_stop_matches_relationship_many_to_one; +extern SEXP syms_warn_matches_relationship_many_to_many; extern SEXP syms_action; extern SEXP syms_vctrs_common_class_fallback; extern SEXP syms_fallback_class; @@ -507,6 +512,7 @@ extern SEXP syms_actual; extern SEXP syms_required; extern SEXP syms_call; extern SEXP syms_dot_call; +extern SEXP syms_which; static const char * const c_strs_vctrs_common_class_fallback = "vctrs:::common_class_fallback"; diff --git a/tests/testthat/_snaps/match.md b/tests/testthat/_snaps/match.md index f9c3d18f7..fbddbabe6 100644 --- a/tests/testthat/_snaps/match.md +++ b/tests/testthat/_snaps/match.md @@ -85,6 +85,34 @@ Error in `vec_locate_matches()`: ! `incomplete` must be one of: "compare", "match", "drop", or "error". +# `multiple` is validated + + Code + (expect_error(vec_locate_matches(1, 2, multiple = 1.5))) + Output + + Error in `vec_locate_matches()`: + ! `multiple` must be a string. + Code + (expect_error(vec_locate_matches(1, 2, multiple = c("first", "last")))) + Output + + Error in `vec_locate_matches()`: + ! `multiple` must be a string. + Code + (expect_error(vec_locate_matches(1, 2, multiple = "x"))) + Output + + Error in `vec_locate_matches()`: + ! `multiple` must be one of "all", "any", "first", or "last". + Code + (expect_error(vec_locate_matches(1, 2, multiple = "x", error_call = call("fn"))) + ) + Output + + Error in `vec_locate_matches()`: + ! `multiple` must be one of "all", "any", "first", or "last". + # `multiple` can error informatively Code @@ -153,33 +181,245 @@ Each element of `foo` can match at most 1 observation from `bar`. x The element at location 1 has multiple matches. -# `multiple` is validated +# `relationship` handles one-to-one case Code - (expect_error(vec_locate_matches(1, 2, multiple = 1.5))) + (expect_error(vec_locate_matches(c(2, 1), c(1, 1), relationship = "one_to_one")) + ) + Output + + Error in `vec_locate_matches()`: + ! Each element can match at most 1 observation. + x The element at location 2 has multiple matches. + Code + (expect_error(vec_locate_matches(c(1, 1), c(1, 2), relationship = "one_to_one")) + ) + Output + + Error in `vec_locate_matches()`: + ! Each element can match at most 1 observation. + x The element at location 1 has multiple matches. + +# `relationship` handles one-to-many case + + Code + (expect_error(vec_locate_matches(c(1, 2, 2), c(2, 1), relationship = "one_to_many")) + ) + Output + + Error in `vec_locate_matches()`: + ! Each element can match at most 1 observation. + x The element at location 1 has multiple matches. + +# `relationship` handles many-to-one case + + Code + (expect_error(vec_locate_matches(c(1, 2), c(1, 2, 2), relationship = "many_to_one")) + ) + Output + + Error in `vec_locate_matches()`: + ! Each element can match at most 1 observation. + x The element at location 2 has multiple matches. + +# `relationship` handles warn-many-to-many case + + Code + (expect_warning(vec_locate_matches(c(1, 2, 1), c(1, 2, 2), relationship = "warn_many_to_many")) + ) + Output + + Warning in `vec_locate_matches()`: + Detected an unexpected many-to-many relationship. + x The element at location 2 has multiple matches. + x The element at location 1 has multiple matches. + Code + (expect_warning(vec_locate_matches(c(1, 1, 2), c(2, 2, 1), relationship = "warn_many_to_many")) + ) + Output + + Warning in `vec_locate_matches()`: + Detected an unexpected many-to-many relationship. + x The element at location 3 has multiple matches. + x The element at location 3 has multiple matches. + +# `relationship` considers `incomplete` matches as possible multiple matches + + Code + (expect_error(vec_locate_matches(x, y, relationship = "one_to_many"))) + Output + + Error in `vec_locate_matches()`: + ! Each element can match at most 1 observation. + x The element at location 1 has multiple matches. + +# `relationship` errors on multiple matches that come from different nesting containers + + Code + (expect_error(vec_locate_matches(df, df2, condition = c("<=", "<="), + relationship = "many_to_one"))) + Output + + Error in `vec_locate_matches()`: + ! Each element can match at most 1 observation. + x The element at location 1 has multiple matches. + +# `relationship` errors when a match from a different nesting container is processed early on + + Code + (expect_error(vec_locate_matches(needles, haystack, condition = "<", + relationship = "many_to_one"))) + Output + + Error in `vec_locate_matches()`: + ! Each element can match at most 1 observation. + x The element at location 1 has multiple matches. + +# `relationship` can still detect problematic `haystack` relationships when `multiple = first/last` are used + + Code + (expect_error(vec_locate_matches(c(3, 1, 1), c(2, 1, 3, 3), multiple = "first", + relationship = "one_to_one"))) + Output + + Error in `vec_locate_matches()`: + ! Each element can match at most 1 observation. + x The element at location 2 has multiple matches. + Code + (expect_error(vec_locate_matches(c(3, 1, 1), c(2, 1, 3, 3), multiple = "first", + relationship = "one_to_many"))) + Output + + Error in `vec_locate_matches()`: + ! Each element can match at most 1 observation. + x The element at location 2 has multiple matches. + +# `relationship` and `remaining` work properly together + + Code + out <- vec_locate_matches(c(1, 2, 2), c(2, 3, 1, 1, 4), relationship = "warn_many_to_many", + remaining = NA_integer_) + Condition + Warning in `vec_locate_matches()`: + Detected an unexpected many-to-many relationship. + x The element at location 1 has multiple matches. + x The element at location 1 has multiple matches. + +# `relationship` errors if `condition` creates multiple matches + + Code + (expect_error(vec_locate_matches(1, c(1, 2), condition = "<=", relationship = "many_to_one")) + ) + Output + + Error in `vec_locate_matches()`: + ! Each element can match at most 1 observation. + x The element at location 1 has multiple matches. + +# `relationship` still errors if `filter` hasn't removed all multiple matches + + Code + (expect_error(vec_locate_matches(1, c(1, 2, 1), condition = "<=", filter = "min", + relationship = "many_to_one"))) + Output + + Error in `vec_locate_matches()`: + ! Each element can match at most 1 observation. + x The element at location 1 has multiple matches. + +# `relationship` errors respect argument tags and error call + + Code + (expect_error(vec_locate_matches(1L, c(1L, 1L), relationship = "one_to_one", + needles_arg = "foo", haystack_arg = "bar", error_call = call("fn")))) + Output + + Error in `fn()`: + ! Each element of `foo` can match at most 1 observation from `bar`. + x The element of `foo` at location 1 has multiple matches. + Code + (expect_error(vec_locate_matches(c(1L, 1L), 1L, relationship = "one_to_one", + needles_arg = "foo", haystack_arg = "bar", error_call = call("fn")))) + Output + + Error in `fn()`: + ! Each element of `bar` can match at most 1 observation from `foo`. + x The element of `bar` at location 1 has multiple matches. + Code + (expect_error(vec_locate_matches(c(1L, 1L), 1L, relationship = "one_to_many", + needles_arg = "foo", haystack_arg = "bar", error_call = call("fn")))) + Output + + Error in `fn()`: + ! Each element of `bar` can match at most 1 observation from `foo`. + x The element of `bar` at location 1 has multiple matches. + Code + (expect_error(vec_locate_matches(1L, c(1L, 1L), relationship = "many_to_one", + needles_arg = "foo", haystack_arg = "bar", error_call = call("fn")))) + Output + + Error in `fn()`: + ! Each element of `foo` can match at most 1 observation from `bar`. + x The element of `foo` at location 1 has multiple matches. + +# `relationship` warnings respect argument tags and error call + + Code + (expect_warning(vec_locate_matches(c(1L, 1L), c(1L, 1L), relationship = "warn_many_to_many", + needles_arg = "foo", haystack_arg = "bar", error_call = call("fn")))) + Output + + Warning in `fn()`: + Detected an unexpected many-to-many relationship between `foo` and `bar`. + x The element of `foo` at location 1 has multiple matches. + x The element of `bar` at location 1 has multiple matches. + Code + (expect_warning(vec_locate_matches(c(1L, 1L), c(1L, 1L), relationship = "warn_many_to_many", + needles_arg = "foo", error_call = call("fn")))) + Output + + Warning in `fn()`: + Detected an unexpected many-to-many relationship. + x The element of `foo` at location 1 has multiple matches. + x The element at location 1 has multiple matches. + Code + (expect_warning(vec_locate_matches(c(1L, 1L), c(1L, 1L), relationship = "warn_many_to_many", + haystack_arg = "bar", error_call = call("fn")))) + Output + + Warning in `fn()`: + Detected an unexpected many-to-many relationship. + x The element at location 1 has multiple matches. + x The element of `bar` at location 1 has multiple matches. + +# `relationship` is validated + + Code + (expect_error(vec_locate_matches(1, 2, relationship = 1.5))) Output Error in `vec_locate_matches()`: - ! `multiple` must be a string. + ! `relationship` must be a string. Code - (expect_error(vec_locate_matches(1, 2, multiple = c("first", "last")))) + (expect_error(vec_locate_matches(1, 2, relationship = c("one_to_one", + "one_to_many")))) Output Error in `vec_locate_matches()`: - ! `multiple` must be a string. + ! `relationship` must be a string. Code - (expect_error(vec_locate_matches(1, 2, multiple = "x"))) + (expect_error(vec_locate_matches(1, 2, relationship = "x"))) Output Error in `vec_locate_matches()`: - ! `multiple` must be one of "all", "any", "first", "last", "warning", or "error". + ! `relationship` must be one of "none", "one_to_one", "one_to_many", "many_to_one", "many_to_many", or "warn_many_to_many". Code - (expect_error(vec_locate_matches(1, 2, multiple = "x", error_call = call("fn"))) - ) + (expect_error(vec_locate_matches(1, 2, relationship = "x", error_call = call( + "fn")))) Output Error in `vec_locate_matches()`: - ! `multiple` must be one of "all", "any", "first", "last", "warning", or "error". + ! `relationship` must be one of "none", "one_to_one", "one_to_many", "many_to_one", "many_to_many", or "warn_many_to_many". # `no_match` can error informatively diff --git a/tests/testthat/test-match.R b/tests/testthat/test-match.R index 98fb9a1a3..7fe1d67cf 100644 --- a/tests/testthat/test-match.R +++ b/tests/testthat/test-match.R @@ -905,6 +905,71 @@ test_that("duplicate needles match the same haystack locations", { expect_identical(x$haystack, c(1L, 3L, 2L, 1L, 3L, 2L)) }) +test_that("correctly gets all matches when they come from different nesting containers", { + needles <- data_frame( + a = c(1, 8), + b = c(2, 9) + ) + haystack <- data_frame( + a = c(6, 5), + b = c(6, 7) + ) + + expect_identical( + vec_locate_matches(needles, haystack, condition = "<", multiple = "all"), + data_frame(needles = c(1L, 1L, 2L), haystack = c(1L, 2L, NA)) + ) +}) + +test_that("correctly gets first/last/any match when they come from different nesting containers", { + needles <- data_frame( + a = c(1, 8), + b = c(2, 9) + ) + haystack <- data_frame( + a = c(6, 5, 0), + b = c(6, 7, 1) + ) + + expect_identical( + vec_locate_matches(needles, haystack, condition = "<", multiple = "first"), + data_frame(needles = c(1L, 2L), haystack = c(1L, NA)) + ) + expect_identical( + vec_locate_matches(needles, haystack, condition = "<", multiple = "last"), + data_frame(needles = c(1L, 2L), haystack = c(2L, NA)) + ) + expect_identical( + vec_locate_matches(needles, haystack, condition = "<", multiple = "any"), + data_frame(needles = c(1L, 2L), haystack = c(2L, NA)) + ) + expect_identical( + vec_locate_matches(needles, haystack, condition = "<", multiple = "first", remaining = NA_integer_), + data_frame(needles = c(1L, 2L, NA, NA), haystack = c(1L, NA, 2L, 3L)) + ) + expect_identical( + vec_locate_matches(needles, haystack, condition = "<", multiple = "last", remaining = NA_integer_), + data_frame(needles = c(1L, 2L, NA, NA), haystack = c(2L, NA, 1L, 3L)) + ) + expect_identical( + vec_locate_matches(needles, haystack, condition = "<", multiple = "any", remaining = NA_integer_), + data_frame(needles = c(1L, 2L, NA, NA), haystack = c(2L, NA, 1L, 3L)) + ) +}) + +test_that("`multiple` is validated", { + expect_snapshot({ + (expect_error(vec_locate_matches(1, 2, multiple = 1.5))) + (expect_error(vec_locate_matches(1, 2, multiple = c("first", "last")))) + (expect_error(vec_locate_matches(1, 2, multiple = "x"))) + # Uses internal error + (expect_error(vec_locate_matches(1, 2, multiple = "x", error_call = call("fn")))) + }) +}) + +# ------------------------------------------------------------------------------ +# vec_locate_matches() - `multiple` (deprecated) + test_that("`multiple` can error informatively", { expect_snapshot({ (expect_error(vec_locate_matches(1L, c(1L, 1L), multiple = "error"))) @@ -970,71 +1035,275 @@ test_that("errors when a match from a different nesting container is processed e ) }) -test_that("correctly gets all matches when they come from different nesting containers", { - needles <- data_frame( - a = c(1, 8), - b = c(2, 9) +test_that("`multiple = 'error'` doesn't error errneously on the last observation", { + expect_error(res <- vec_locate_matches(1:2, 1:2, multiple = "error"), NA) + expect_identical(res$needles, 1:2) + expect_identical(res$haystack, 1:2) +}) + +# ------------------------------------------------------------------------------ +# vec_locate_matches() - `relationship` + +test_that("`relationship` handles one-to-one case", { + # No error + expect_identical( + vec_locate_matches(1:2, 2:1, relationship = "one_to_one"), + vec_locate_matches(1:2, 2:1) ) - haystack <- data_frame( - a = c(6, 5), - b = c(6, 7) + + # Doesn't care about the zero match case + expect_identical( + vec_locate_matches(1:2, 3:4, relationship = "one_to_one"), + vec_locate_matches(1:2, 3:4) ) + expect_snapshot({ + (expect_error(vec_locate_matches(c(2, 1), c(1, 1), relationship = "one_to_one"))) + (expect_error(vec_locate_matches(c(1, 1), c(1, 2), relationship = "one_to_one"))) + }) +}) + +test_that("`relationship` handles one-to-many case", { + # No error expect_identical( - vec_locate_matches(needles, haystack, condition = "<", multiple = "all"), - data_frame(needles = c(1L, 1L, 2L), haystack = c(1L, 2L, NA)) + vec_locate_matches(c(1, 2), c(1, 2, 2), relationship = "one_to_many"), + vec_locate_matches(c(1, 2), c(1, 2, 2)) + ) + + # Doesn't care about the zero match case + expect_identical( + vec_locate_matches(1:2, 3:4, relationship = "one_to_many"), + vec_locate_matches(1:2, 3:4) ) + + expect_snapshot({ + (expect_error(vec_locate_matches(c(1, 2, 2), c(2, 1), relationship = "one_to_many"))) + }) }) -test_that("correctly gets first/last/any match when they come from different nesting containers", { - needles <- data_frame( - a = c(1, 8), - b = c(2, 9) +test_that("`relationship` handles many-to-one case", { + # No error + expect_identical( + vec_locate_matches(c(1, 2, 2), c(1, 2), relationship = "many_to_one"), + vec_locate_matches(c(1, 2, 2), c(1, 2)) ) - haystack <- data_frame( - a = c(6, 5, 0), - b = c(6, 7, 1) + + # Doesn't care about the zero match case + expect_identical( + vec_locate_matches(1:2, 3:4, relationship = "many_to_one"), + vec_locate_matches(1:2, 3:4) ) + expect_snapshot({ + (expect_error(vec_locate_matches(c(1, 2), c(1, 2, 2), relationship = "many_to_one"))) + }) +}) + +test_that("`relationship` handles many-to-many case", { + # No error expect_identical( - vec_locate_matches(needles, haystack, condition = "<", multiple = "first"), - data_frame(needles = c(1L, 2L), haystack = c(1L, NA)) + vec_locate_matches(c(1, 2, 2), c(1, 2), relationship = "many_to_many"), + vec_locate_matches(c(1, 2, 2), c(1, 2)) ) + + # No error expect_identical( - vec_locate_matches(needles, haystack, condition = "<", multiple = "last"), - data_frame(needles = c(1L, 2L), haystack = c(2L, NA)) + vec_locate_matches(c(1, 2), c(1, 2, 2), relationship = "many_to_many"), + vec_locate_matches(c(1, 2), c(1, 2, 2)) ) + + # No error expect_identical( - vec_locate_matches(needles, haystack, condition = "<", multiple = "any"), - data_frame(needles = c(1L, 2L), haystack = c(2L, NA)) + vec_locate_matches(c(1, 1, 2), c(1, 2, 2), relationship = "many_to_many"), + vec_locate_matches(c(1, 1, 2), c(1, 2, 2)) ) + + # Doesn't care about the zero match case expect_identical( - vec_locate_matches(needles, haystack, condition = "<", multiple = "first", remaining = NA_integer_), - data_frame(needles = c(1L, 2L, NA, NA), haystack = c(1L, NA, 2L, 3L)) + vec_locate_matches(1:2, 3:4, relationship = "many_to_many"), + vec_locate_matches(1:2, 3:4) ) +}) + +test_that("`relationship` handles warn-many-to-many case", { + # No warning expect_identical( - vec_locate_matches(needles, haystack, condition = "<", multiple = "last", remaining = NA_integer_), - data_frame(needles = c(1L, 2L, NA, NA), haystack = c(2L, NA, 1L, 3L)) + expect_silent( + vec_locate_matches(c(1, 2, 2), c(1, 2), relationship = "warn_many_to_many") + ), + vec_locate_matches(c(1, 2, 2), c(1, 2)) ) + + # No warning expect_identical( - vec_locate_matches(needles, haystack, condition = "<", multiple = "any", remaining = NA_integer_), - data_frame(needles = c(1L, 2L, NA, NA), haystack = c(2L, NA, 1L, 3L)) + expect_silent( + vec_locate_matches(c(1, 2), c(1, 2, 2), relationship = "warn_many_to_many") + ), + vec_locate_matches(c(1, 2), c(1, 2, 2)) ) + + # Doesn't care about the zero match case + expect_identical( + expect_silent( + vec_locate_matches(1:2, 3:4, relationship = "warn_many_to_many") + ), + vec_locate_matches(1:2, 3:4) + ) + + # Specifically designed to ensure we test both: + # - Finding multiple `needles` matches before multiple `haystack` matches + # - Finding multiple `haystack` matches before multiple `needles` matches + expect_snapshot({ + (expect_warning(vec_locate_matches(c(1, 2, 1), c(1, 2, 2), relationship = "warn_many_to_many"))) + (expect_warning(vec_locate_matches(c(1, 1, 2), c(2, 2, 1), relationship = "warn_many_to_many"))) + }) }) -test_that("`multiple = 'error'` doesn't error errneously on the last observation", { - expect_error(res <- vec_locate_matches(1:2, 1:2, multiple = "error"), NA) +test_that("`relationship` considers `incomplete` matches as possible multiple matches", { + x <- c(1, NA, NaN) + y <- c(NA, 1) + + expect_snapshot({ + (expect_error(vec_locate_matches(x, y, relationship = "one_to_many"))) + }) + + # No error + expect_identical( + vec_locate_matches(x, y, relationship = "one_to_many", incomplete = NA), + vec_locate_matches(x, y, incomplete = NA) + ) + + # No error + expect_identical( + vec_locate_matches(x, y, relationship = "one_to_many", nan_distinct = TRUE), + vec_locate_matches(x, y, nan_distinct = TRUE) + ) +}) + +test_that("`relationship` errors on multiple matches that come from different nesting containers", { + df <- data_frame(x = 0, y = 0) + df2 <- data_frame(x = 1:2, y = 2:1) + + expect_snapshot({ + (expect_error(vec_locate_matches(df, df2, condition = c("<=", "<="), relationship = "many_to_one"))) + }) +}) + +test_that("`relationship` errors when a match from a different nesting container is processed early on", { + # Row 1 has 2 matches + # Row 2 has 0 matches + needles <- data_frame( + a = c(1, 8), + b = c(2, 9) + ) + + # Rows 1 and 2 end up in different nesting containers + haystack <- data_frame( + a = c(5, 6), + b = c(7, 6) + ) + + # needles[1,] records the haystack[1,] match first, which is in the 1st + # value of `loc_first_match_o_haystack`, then records the haystack[3,] match + # which is in the 3rd value of `loc_first_match_o_haystack` even though it + # is processed 2nd (i.e. we need to use `loc` rather than `i` when detecting + # multiple matches) + expect_snapshot({ + (expect_error(vec_locate_matches(needles, haystack, condition = "<", relationship = "many_to_one"))) + }) +}) + +test_that("`relationship` doesn't error errneously on the last observation", { + expect_error(res <- vec_locate_matches(1:2, 1:2, relationship = "many_to_one"), NA) expect_identical(res$needles, 1:2) expect_identical(res$haystack, 1:2) }) -test_that("`multiple` is validated", { +test_that("`relationship` doesn't error if `multiple` removes multiple matches", { + out <- vec_locate_matches(c(1, 2), c(1, 1), multiple = "any", relationship = "one_to_one") + expect_identical(out$needles, c(1L, 2L)) + expect_identical(out$haystack, c(1L, NA)) + + out <- vec_locate_matches(c(1, 2), c(1, 1), multiple = "first", relationship = "one_to_one") + expect_identical(out$needles, c(1L, 2L)) + expect_identical(out$haystack, c(1L, NA)) + + out <- vec_locate_matches(c(1, 2), c(1, 1), multiple = "last", relationship = "one_to_one") + expect_identical(out$needles, c(1L, 2L)) + expect_identical(out$haystack, c(2L, NA)) +}) + +test_that("`relationship` can still detect problematic `haystack` relationships when `multiple = first/last` are used", { expect_snapshot({ - (expect_error(vec_locate_matches(1, 2, multiple = 1.5))) - (expect_error(vec_locate_matches(1, 2, multiple = c("first", "last")))) - (expect_error(vec_locate_matches(1, 2, multiple = "x"))) + (expect_error(vec_locate_matches(c(3, 1, 1), c(2, 1, 3, 3), multiple = "first", relationship = "one_to_one"))) + (expect_error(vec_locate_matches(c(3, 1, 1), c(2, 1, 3, 3), multiple = "first", relationship = "one_to_many"))) + }) +}) + +test_that("`relationship` and `remaining` work properly together", { + expect_snapshot({ + out <- vec_locate_matches( + c(1, 2, 2), + c(2, 3, 1, 1, 4), + relationship = "warn_many_to_many", + remaining = NA_integer_ + ) + }) + expect_identical(out$needles, c(1L, 1L, 2L, 3L, NA, NA)) + expect_identical(out$haystack, c(3L, 4L, 1L, 1L, 2L, 5L)) +}) + +test_that("`relationship` errors if `condition` creates multiple matches", { + expect_snapshot({ + (expect_error(vec_locate_matches(1, c(1, 2), condition = "<=", relationship = "many_to_one"))) + }) +}) + +test_that("`relationship` doesn't error if `filter` removes multiple matches", { + out <- vec_locate_matches(1, c(1, 2), condition = "<=", filter = "min", relationship = "many_to_one") + expect_identical(out$needles, 1L) + expect_identical(out$haystack, 1L) + + out <- vec_locate_matches(1, c(1, 2), condition = "<=", filter = "max", relationship = "many_to_one") + expect_identical(out$needles, 1L) + expect_identical(out$haystack, 2L) +}) + +test_that("`relationship` still errors if `filter` hasn't removed all multiple matches", { + expect_snapshot({ + (expect_error(vec_locate_matches(1, c(1, 2, 1), condition = "<=", filter = "min", relationship = "many_to_one"))) + }) + + # But not here + out <- vec_locate_matches(c(1, 1), c(1, 2, 1), condition = "<=", filter = "max", relationship = "many_to_one") + expect_identical(out$needles, c(1L, 2L)) + expect_identical(out$haystack, c(2L, 2L)) +}) + +test_that("`relationship` errors respect argument tags and error call", { + expect_snapshot({ + (expect_error(vec_locate_matches(1L, c(1L, 1L), relationship = "one_to_one", needles_arg = "foo", haystack_arg = "bar", error_call = call("fn")))) + (expect_error(vec_locate_matches(c(1L, 1L), 1L, relationship = "one_to_one", needles_arg = "foo", haystack_arg = "bar", error_call = call("fn")))) + (expect_error(vec_locate_matches(c(1L, 1L), 1L, relationship = "one_to_many", needles_arg = "foo", haystack_arg = "bar", error_call = call("fn")))) + (expect_error(vec_locate_matches(1L, c(1L, 1L), relationship = "many_to_one", needles_arg = "foo", haystack_arg = "bar", error_call = call("fn")))) + }) +}) + +test_that("`relationship` warnings respect argument tags and error call", { + expect_snapshot({ + (expect_warning(vec_locate_matches(c(1L, 1L), c(1L, 1L), relationship = "warn_many_to_many", needles_arg = "foo", haystack_arg = "bar", error_call = call("fn")))) + (expect_warning(vec_locate_matches(c(1L, 1L), c(1L, 1L), relationship = "warn_many_to_many", needles_arg = "foo", error_call = call("fn")))) + (expect_warning(vec_locate_matches(c(1L, 1L), c(1L, 1L), relationship = "warn_many_to_many", haystack_arg = "bar", error_call = call("fn")))) + }) +}) + +test_that("`relationship` is validated", { + expect_snapshot({ + (expect_error(vec_locate_matches(1, 2, relationship = 1.5))) + (expect_error(vec_locate_matches(1, 2, relationship = c("one_to_one", "one_to_many")))) + (expect_error(vec_locate_matches(1, 2, relationship = "x"))) # Uses internal error - (expect_error(vec_locate_matches(1, 2, multiple = "x", error_call = call("fn")))) + (expect_error(vec_locate_matches(1, 2, relationship = "x", error_call = call("fn")))) }) }) From 58dcddd2b95c46fd3008e1312cdb13602dab7d31 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Tue, 21 Feb 2023 17:56:54 -0500 Subject: [PATCH 2/3] NEWS bullet --- NEWS.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/NEWS.md b/NEWS.md index 55addec33..a0aefa844 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,12 @@ # vctrs (development version) +* `vec_locate_matches()` gains a new `relationship` argument that holistically + handles multiple matches between `needles` and `haystack`. In particular, + `relationship = "many_to_one"` replaces `multiple = "error"` and + `multiple = "warning"`, which have been removed from the documentation and + silently soft-deprecated. Official deprecation for those options will start in + a future release (#1791). + * `vec_slice()` has gained an `error_call` argument (#1785). * New `obj_is_vector()`, `obj_check_vector()`, and `vec_check_size()` validation From 5c40ede8664edc6d8620cd0fc5cff80f4802be9c Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Tue, 21 Feb 2023 18:14:42 -0500 Subject: [PATCH 3/3] Remove slightly incorrect comment --- src/match.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/match.c b/src/match.c index c88e29030..793e2ad2e 100644 --- a/src/match.c +++ b/src/match.c @@ -1672,7 +1672,6 @@ r_obj* expand_compact_indices(const int* v_o_haystack, bool any_multiple_needles = false; bool any_multiple_haystack = false; - // For `relationship = "warn_many_to_many"` r_ssize loc_first_multiple_needles = -1; r_ssize loc_first_multiple_haystack = -1;