-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #12 from lewinfox/feature/weights
Feature/weights
- Loading branch information
Showing
17 changed files
with
722 additions
and
23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,14 @@ | ||
Type: Package | ||
Package: levitate | ||
Title: Fuzzy String Comparison | ||
Version: 0.1.0.9000 | ||
Version: 0.2.0 | ||
Authors@R: | ||
person(given = "Lewin", | ||
family = "Appleton-Fox", | ||
role = c("aut", "cre", "cph"), | ||
email = "[email protected]") | ||
person("Lewin", "Appleton-Fox", , "[email protected]", role = c("aut", "cre", "cph")) | ||
Description: Provides string similarity calculations inspired by the | ||
Python 'thefuzz' package. Compare strings by edit distance, | ||
similarity ratio, best matching substring, ordered token matching and | ||
set-based token matching. A range of edit distance measures are | ||
available thanks to the 'stringdist' package. | ||
Python 'thefuzz' package. Compare strings by edit distance, similarity | ||
ratio, best matching substring, ordered token matching and set-based | ||
token matching. A range of edit distance measures are available thanks | ||
to the 'stringdist' package. | ||
License: GPL-3 | ||
URL: https://lewinfox.github.io/levitate/, | ||
https://github.com/lewinfox/levitate/, | ||
|
@@ -20,15 +17,14 @@ BugReports: https://github.com/lewinfox/levitate/issues | |
Depends: | ||
R (>= 2.10) | ||
Imports: | ||
cli, | ||
glue, | ||
rlang, | ||
stringdist, | ||
stringr | ||
stringdist | ||
Suggests: | ||
glue, | ||
knitr, | ||
pkgdown, | ||
rmarkdown, | ||
styler, | ||
testthat | ||
VignetteBuilder: | ||
knitr | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,12 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export(lev_best_match) | ||
export(lev_distance) | ||
export(lev_partial_ratio) | ||
export(lev_ratio) | ||
export(lev_score_multiple) | ||
export(lev_token_set_ratio) | ||
export(lev_token_sort_ratio) | ||
export(lev_weighted_token_ratio) | ||
export(lev_weighted_token_set_ratio) | ||
export(lev_weighted_token_sort_ratio) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
#' Score multiple candidate strings against a single input
#'
#' Given a single `input` string and multiple `candidates`, compute scores for each candidate.
#'
#' @param input A single string
#' @param candidates One or more candidate strings to score
#' @param .fn The scoring function to use, as a string or function object. Defaults to
#'   [lev_ratio()]. It is called as `.fn(candidate, input, ...)` and must return a single number.
#' @param ... Additional arguments to pass to `.fn`.
#' @param decreasing If `TRUE` (the default), the candidate with the highest score is ranked first.
#'   If using a comparison `.fn` that computes _distance_ rather than similarity, or if you want the
#'   worst match to be returned first, set this to `FALSE`.
#'
#' @return A list where the keys are `candidates` and the values are the scores. The list is sorted
#'   according to the `decreasing` parameter, so by default higher scores are first. An empty
#'   `candidates` vector gives an empty list.
#'
#' @examples
#' lev_score_multiple("bilbo", c("frodo", "gandalf", "legolas"))
#' @export
#' @seealso [lev_best_match()]
lev_score_multiple <- function(input, candidates, .fn = lev_ratio, ..., decreasing = TRUE) {
  # Reject zero-length input as well as vectors: `.fn` is scored against exactly one string.
  if (length(input) != 1) {
    rlang::abort(paste0("`input` must be length 1, not ", length(input)))
  }
  .fn <- match.fun(.fn)
  # vapply() always yields a numeric vector, even for zero candidates. sapply() would return an
  # empty list in that case, which sort() cannot handle.
  scores <- vapply(candidates, .fn, FUN.VALUE = numeric(1), input, ...)
  as.list(sort(scores, decreasing = decreasing))
}
|
||
#' Get the best matched string from a list of candidates
#'
#' Scores every candidate against `input` with [lev_score_multiple()] and returns the name of the
#' first entry of the sorted result, i.e. the top-ranked candidate under `.fn` and `decreasing`.
#'
#' @inheritParams lev_score_multiple
#' @return A string
#' @seealso [lev_score_multiple()]
#' @examples
#' lev_best_match("bilbo", c("frodo", "gandalf", "legolas"))
#' @export
lev_best_match <- function(input, candidates, .fn = lev_ratio, ..., decreasing = TRUE) {
  ranked <- lev_score_multiple(
    input = input,
    candidates = candidates,
    .fn = .fn,
    ...,
    decreasing = decreasing
  )
  # The ranked list is keyed by candidate, so the first name is the winner.
  ranked_candidates <- names(ranked)
  ranked_candidates[[1]]
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
#' Weighted token similarity measure
#'
#' Computes similarity but allows you to assign weights to specific tokens. This is useful, for
#' example, when you have a frequently-occurring string that doesn't contain useful information. See
#' examples.
#'
#' # Details
#'
#' The algorithm used here is as follows:
#'
#' * Tokenise the input strings
#' * Pad the shorter token vector with empty strings so tokens can be compared position by position
#' * Compute the edit distance between each pair of tokens
#' * Compute the maximum edit distance between each pair of tokens (the length of the longer token)
#' * Apply any weights from the `weights` argument
#' * Return `1 - (sum(weighted_edit_distances) / sum(weighted_max_edit_distance))`
#'
#' If the weighted maximum edit distances sum to zero the division is undefined and the result is
#' not a finite score.
#'
#' @inheritParams default-params
#' @param weights List of token weights. For example, `weights = list(foo = 0.9, bar = 0.1)`. Any
#'   tokens omitted from `weights` will be given a weight of 1.
#' @param ... Additional arguments passed unchanged to every [lev_distance()] call.
#'
#' @return A float
#' @export
#'
#' @family weighted token functions
#'
#' @examples
#' lev_weighted_token_ratio("jim ltd", "tim ltd")
#'
#' lev_weighted_token_ratio("tim ltd", "jim ltd", weights = list(ltd = 0.1))
lev_weighted_token_ratio <- function(a, b, weights = list(), ...) {
  if (length(a) != 1 || length(b) != 1) {
    rlang::abort("`a` and `b` must be length 1")
  }
  a_tokens <- unlist(str_tokenise(a))
  b_tokens <- unlist(str_tokenise(b))

  # If the token lists aren't the same length we will pad the shorter list with empty strings
  if (length(a_tokens) > length(b_tokens)) {
    b_tokens <- c(b_tokens, rep("", length(a_tokens) - length(b_tokens)))
  } else if (length(a_tokens) < length(b_tokens)) {
    a_tokens <- c(a_tokens, rep("", length(b_tokens) - length(a_tokens)))
  }

  # `MoreArgs` must be a list. Passing `...` directly would splice the extra arguments into
  # mapply()'s own vectorised arguments, so they would be recycled against the tokens instead of
  # being handed to every lev_distance() call unchanged.
  token_lev_distances <- mapply(lev_distance, a_tokens, b_tokens, MoreArgs = list(...))

  # Weights are applied where
  #
  # * a token is in the `weights` list
  # * AND the token appears in the same position in a and b.
  # * OR the token appears in a OR b and the corresponding token is missing (which has the effect
  #   of reducing the impact of tokens that appear in one string but not the other).
  weighted_names <- names(weights)
  weights_to_apply <- mapply(
    function(token_a, token_b) {
      if (token_a == token_b && token_a %in% weighted_names) {
        weights[[token_a]]
      } else if (token_a == "" && token_b %in% weighted_names) {
        weights[[token_b]]
      } else if (token_b == "" && token_a %in% weighted_names) {
        weights[[token_a]]
      } else {
        1
      }
    },
    a_tokens,
    b_tokens
  )

  # The similarity score is (1 - (edit_distance / max_edit_distance)), after weighting. The largest
  # possible edit distance for a pair of tokens is the length of the longer token.
  weighted_edit_distances <- token_lev_distances * weights_to_apply
  weighted_max_edit_distances <- pmax(nchar(a_tokens), nchar(b_tokens)) * weights_to_apply

  1 - (sum(weighted_edit_distances) / sum(weighted_max_edit_distances))
}
|
||
#' Weighted version of lev_token_sort_ratio()
#'
#' Both inputs are passed through the package's token-sorting helper and the results are then
#' compared with [lev_weighted_token_ratio()], which applies the `weights` argument and produces
#' the overall similarity score.
#'
#' @inheritParams default-params
#' @inheritParams lev_weighted_token_ratio
#'
#' @return Float
#' @export
#' @family weighted token functions
#' @seealso [lev_token_sort_ratio()]
lev_weighted_token_sort_ratio <- function(a, b, weights = list(), ...) {
  both_scalar <- length(a) == 1 && length(b) == 1
  if (!both_scalar) {
    rlang::abort("`a` and `b` must be length 1")
  }

  sorted_a <- str_token_sort(a)
  sorted_b <- str_token_sort(b)
  lev_weighted_token_ratio(sorted_a, sorted_b, weights = weights, ...)
}
|
||
#' Weighted version of `lev_token_set_ratio()`
#'
#' Splits the tokens of `a` and `b` into those shared by both inputs and those unique to each, then
#' rebuilds two strings of the form "sorted shared tokens, then sorted unique tokens". The result is
#' the highest [lev_weighted_token_ratio()] among the three pairings of the shared-token string and
#' the two rebuilt strings.
#'
#' @inheritParams default-params
#' @inheritParams lev_weighted_token_ratio
#' @return Float
#' @family weighted token functions
#' @seealso [lev_token_set_ratio()]
#' @export
lev_weighted_token_set_ratio <- function(a, b, weights = list(), ...) {
  both_scalar <- length(a) == 1 && length(b) == 1
  if (!both_scalar) {
    rlang::abort("`a` and `b` must be length 1")
  }

  tokens_a <- unlist(str_tokenise(a))
  tokens_b <- unlist(str_tokenise(b))

  # Partition the tokens: shared by both inputs, only in `a`, only in `b` (each sorted).
  shared <- sort(intersect(tokens_a, tokens_b))
  only_a <- sort(setdiff(tokens_a, tokens_b))
  only_b <- sort(setdiff(tokens_b, tokens_a))

  # Rebuild each input as "{sorted shared tokens} {sorted remaining tokens}", and collapse the
  # shared tokens into a single string so it can be compared like any other input.
  rebuilt_a <- paste(c(shared, only_a), collapse = " ")
  rebuilt_b <- paste(c(shared, only_b), collapse = " ")
  shared_string <- paste(shared, collapse = " ")

  # The score is the best of the three pairwise comparisons.
  shared_vs_a <- lev_weighted_token_ratio(shared_string, rebuilt_a, weights = weights, ...)
  shared_vs_b <- lev_weighted_token_ratio(shared_string, rebuilt_b, weights = weights, ...)
  a_vs_b <- lev_weighted_token_ratio(rebuilt_a, rebuilt_b, weights = weights, ...)

  max(shared_vs_a, shared_vs_b, a_vs_b)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.