From ec988facdc79e80d801580d3824677773daa6dd4 Mon Sep 17 00:00:00 2001 From: Adam Shannon Date: Mon, 13 Nov 2023 13:50:39 -0600 Subject: [PATCH 1/2] cmd/server: only check adjacent terms for local jaro max score --- cmd/server/issue326_test.go | 10 +++++----- cmd/server/search.go | 24 +++++++++++++++++------- cmd/server/search_test.go | 26 +++++++++++++++++--------- 3 files changed, 39 insertions(+), 21 deletions(-) diff --git a/cmd/server/issue326_test.go b/cmd/server/issue326_test.go index d41d99a3..808bbe23 100644 --- a/cmd/server/issue326_test.go +++ b/cmd/server/issue326_test.go @@ -16,17 +16,17 @@ func TestIssue326(t *testing.T) { // Cuba score := jaroWinkler(precompute("Huawei Cuba"), precompute("Huawei")) - assert.Equal(t, score, 0.8055555555555556) + assert.Equal(t, 0.8055555555555556, score) // India score = jaroWinkler(india, precompute("Huawei")) - assert.Equal(t, score, 0.5592063492063492) + assert.Equal(t, 0.4846031746031746, score) score = jaroWinkler(india, precompute("Huawei Technologies")) - assert.Equal(t, score, 0.6903174603174603) + assert.Equal(t, 0.6903174603174603, score) // Investment score = jaroWinkler(investment, precompute("Huawei")) - assert.Equal(t, score, 0.3788888888888889) + assert.Equal(t, 0.3788888888888889, score) score = jaroWinkler(investment, precompute("Huawei Technologies")) - assert.Equal(t, score, 0.7377777777777779) + assert.Equal(t, 0.7377777777777779, score) } diff --git a/cmd/server/search.go b/cmd/server/search.go index 72504802..7663fc90 100644 --- a/cmd/server/search.go +++ b/cmd/server/search.go @@ -648,21 +648,32 @@ func readInt(override string, value int) int { // jaroWinkler runs the similarly named algorithm over the two input strings and averages their match percentages // according to the second string (assumed to be the user's query) // +// Terms are compared between a few adjacent terms and accumulate the highest near-neighbor match. 
+// // For more details see https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance func jaroWinkler(s1, s2 string) float64 { return jaroWinklerWithFavoritism(s1, s2, exactMatchFavoritism) } +const adjacentSimilarityPositions = 3 + func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 { - maxMatch := func(word string, parts []string) float64 { + maxMatch := func(word string, s1Idx int, parts []string) float64 { if len(parts) == 0 { return 0.0 } - max := smetrics.JaroWinkler(word, parts[0], boostThreshold, prefixSize) - for i := 1; i < len(parts); i++ { - if score := smetrics.JaroWinkler(word, parts[i], boostThreshold, prefixSize); score > max { - max = score + // We're only looking for the highest match close + start := s1Idx - adjacentSimilarityPositions + end := s1Idx + adjacentSimilarityPositions + + var max float64 + for i := start; i < end; i++ { + if i >= 0 && len(parts) > i { + score := smetrics.JaroWinkler(word, parts[i], boostThreshold, prefixSize) + if score > max { + max = score + } } } return max @@ -675,7 +686,7 @@ func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 { var scores []float64 for i := range s1Parts { - max := maxMatch(s1Parts[i], s2Parts) + max := maxMatch(s1Parts[i], i, s2Parts) if max >= 1.0 { max += favoritism } @@ -692,7 +703,6 @@ func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 { for i := range scores { sum += scores[i] } - return math.Min(sum/float64(len(scores)), 1.00) } diff --git a/cmd/server/search_test.go b/cmd/server/search_test.go index 0a27a11e..0738d508 100644 --- a/cmd/server/search_test.go +++ b/cmd/server/search_test.go @@ -451,6 +451,15 @@ func TestJaroWinkler(t *testing.T) { {"nicolas", "nicolas", 1.0}, {"nicolas moros maduro", "nicolas maduro", 0.91}, {"nicolas maduro", "nicolas moros maduro", 1.0}, + // customer examples + {"ian mckinley", "tian xiang 7", 0.750}, + {"bindaree food group pty ltd", "independent insurance group ltd", 0.812}, + 
{"p.c.c. (singapore) private limited", "culver max entertainment private limited", 0.753}, + {"zincum llc", "easy verification inc.", 0.639}, + {"transpetrochart co ltd", "jx metals trading co.", 0.725}, + {"technolab", "moomoo technologies inc", 0.87222}, + {"sewa security services", "sesa - safety & environmental services australia pty ltd", 0.740}, + {"bueno", "20/f rykadan capital twr135 hoi bun rd, kwun tong 135 hoi bun rd., kwun tong", 0.0}, // example cases {"nicolas maduro", "nicolás maduro", 0.961}, {"nicolas maduro", precompute("nicolás maduro"), 1.0}, @@ -465,12 +474,11 @@ func TestJaroWinkler(t *testing.T) { {"nicolas, maduro moros", "nicolás", 0.627}, {"nicolas, maduro moros", "maduro", 0.720}, {"nicolas, maduro moros", "nicolás maduro", 0.877}, - {"africada financial services bureau change", "skylight", 0.352}, + {"africada financial services bureau change", "skylight", 0.266}, {"africada financial services bureau change", "skylight financial inc", 0.72}, {"africada financial services bureau change", "skylight services inc", 0.806}, {"africada financial services bureau change", "skylight financial services", 0.887}, {"africada financial services bureau change", "skylight financial services inc", 0.79}, - // stopwords tests {"the group for the preservation of the holy sites", "the bridgespan group", 1.00}, {precompute("the group for the preservation of the holy sites"), precompute("the bridgespan group"), 1.00}, @@ -481,15 +489,15 @@ func TestJaroWinkler(t *testing.T) { {"the group for the preservation of the holy sites", "the anything group", 1.00}, {precompute("the group for the preservation of the holy sites"), precompute("the anything group"), 1.00}, {"group preservation holy sites", "anything group", 0.617}, - {"the group for the preservation of the holy sites", "the hello world group", 1.00}, - {precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 1.00}, + {"the group for the preservation of the 
holy sites", "the hello world group", 0.922}, + {precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 0.922}, {"group preservation holy sites", "hello world group", 0.687}, - {"the group for the preservation of the holy sites", "the group", 0.67}, - {precompute("the group for the preservation of the holy sites"), precompute("the group"), 0.67}, + {"the group for the preservation of the holy sites", "the group", 0.431}, + {precompute("the group for the preservation of the holy sites"), precompute("the group"), 0.431}, {"group preservation holy sites", "group", 0.460}, - {"the group for the preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.699}, - {precompute("the group for the preservation of the holy sites"), precompute("the flibbity jibbity flobbity jobbity grobbity zobbity group"), .783}, - {"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.590}, + {"the group for the preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.517}, + {precompute("the group for the preservation of the holy sites"), precompute("the flibbity jibbity flobbity jobbity grobbity zobbity group"), 0.572}, + {"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.418}, // precompute {"i c sogo kenkyusho", precompute("A.I.C. 
SOGO KENKYUSHO"), 0.667}, From 489fc9e0b2830f8fdf9e3cebb767b9794212e23b Mon Sep 17 00:00:00 2001 From: Adam Shannon Date: Mon, 13 Nov 2023 13:54:44 -0600 Subject: [PATCH 2/2] cmd/server: read ADJACENT_SIMILARITY_POSITIONS env var --- README.md | 1 + cmd/server/search.go | 4 +++- docs/usage-configuration.md | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 716102d3..5e4115d8 100644 --- a/README.md +++ b/README.md @@ -192,6 +192,7 @@ PONG |-----|-----|-----| | `DATA_REFRESH_INTERVAL` | Interval for data redownload and reparse. `off` disables this refreshing. | 12h | | `INITIAL_DATA_DIRECTORY` | Directory filepath with initial files to use instead of downloading. Periodic downloads will replace the initial files. | Empty | +| `ADJACENT_SIMILARITY_POSITIONS` | How many nearby words to search for the highest similarity score. | 3 | | `EXACT_MATCH_FAVORITISM` | Extra weighting assigned to exact matches. | 0.0 | | `JARO_WINKLER_BOOST_THRESHOLD` | Jaro-Winkler boost threshold. | 0.7 | | `JARO_WINKLER_PREFIX_SIZE` | Jaro-Winkler prefix size. | 4 | diff --git a/cmd/server/search.go b/cmd/server/search.go index 7663fc90..d9f5fcc5 100644 --- a/cmd/server/search.go +++ b/cmd/server/search.go @@ -655,7 +655,9 @@ func jaroWinkler(s1, s2 string) float64 { return jaroWinklerWithFavoritism(s1, s2, exactMatchFavoritism) } -const adjacentSimilarityPositions = 3 +var ( + adjacentSimilarityPositions = readInt(os.Getenv("ADJACENT_SIMILARITY_POSITIONS"), 3) +) func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 { maxMatch := func(word string, s1Idx int, parts []string) float64 { diff --git a/docs/usage-configuration.md b/docs/usage-configuration.md index b6d2992d..d298465f 100644 --- a/docs/usage-configuration.md +++ b/docs/usage-configuration.md @@ -12,6 +12,7 @@ menubar: docs-menu |-----|-----|-----| | `DATA_REFRESH_INTERVAL` | Interval for data redownload and reparse. `off` disables this refreshing. 
| 12h | | `INITIAL_DATA_DIRECTORY` | Directory filepath with initial files to use instead of downloading. Periodic downloads will replace the initial files. | Empty | +| `ADJACENT_SIMILARITY_POSITIONS` | How many nearby words to search for the highest similarity score. | 3 | | `EXACT_MATCH_FAVORITISM` | Extra weighting assigned to exact matches. | 0.0 | | `JARO_WINKLER_BOOST_THRESHOLD` | Jaro-Winkler boost threshold. | 0.7 | | `JARO_WINKLER_PREFIX_SIZE` | Jaro-Winkler prefix size. | 4 |