From e68b03dcfc9f8aafef62ae34b5a470c2aa268acb Mon Sep 17 00:00:00 2001 From: Tom Daffurn Date: Thu, 30 Nov 2023 18:21:11 +1000 Subject: [PATCH 1/7] Experimental search algo --- cmd/server/search.go | 171 +++++++++++++++++++++++++++++------ cmd/server/search_generic.go | 7 +- 2 files changed, 149 insertions(+), 29 deletions(-) diff --git a/cmd/server/search.go b/cmd/server/search.go index d5b2a73a1..e65a826e9 100644 --- a/cmd/server/search.go +++ b/cmd/server/search.go @@ -248,6 +248,7 @@ func (s *searcher) FindAlts(limit int, id string) []*ofac.AlternateIdentity { func (s *searcher) TopAltNames(limit int, minMatch float64, alt string) []Alt { alt = precompute(alt) + altTokens := strings.Fields(alt) s.RLock() defer s.RUnlock() @@ -268,7 +269,7 @@ func (s *searcher) TopAltNames(limit int, minMatch float64, alt string) []Alt { xs.add(&item{ matched: s.Alts[i].name, value: s.Alts[i], - weight: jaroWinkler(s.Alts[i].name, alt), + weight: bestPairsJaroWinkler(altTokens, s.Alts[i].name), }) }(i) } @@ -290,6 +291,114 @@ func (s *searcher) TopAltNames(limit int, minMatch float64, alt string) []Alt { return out } +// bestPairsJaroWinkler compares a search query to an indexed term (name, address, etc) and returns a decimal fraction +// score. +// +// The algorithm splits each string into tokens, and does a pairwise Jaro-Winkler score of all token combinations +// (outer product). The best match for each search token is chosen, such that each index token can be matched at most +// once. +// +// The pairwise scores are combined into an average in a way that corrects for character length, and the fraction of the +// indexed term that didn't match. 
+func bestPairsJaroWinkler(searchTokens []string, indexed string) float64 { + type Score struct { + score float64 + searchTokenIdx int + indexTokenIdx int + } + + indexedTokens := strings.Fields(indexed) + searchTokensLength := sumLength(searchTokens) + indexTokensLength := sumLength(indexedTokens) + + //Compare each search token to each indexed token. Sort the results in descending order + scores := make([]Score, 0) + for searchIdx, searchToken := range searchTokens { + for indexIdx, indexedToken := range indexedTokens { + score := customJaroWinkler(indexedToken, searchToken) + scores = append(scores, Score{score, searchIdx, indexIdx}) + } + } + sort.Slice(scores[:], func(i, j int) bool { + return scores[i].score > scores[j].score + }) + + //Pick the highest score for each search term, where the indexed token hasn't yet been matched + matchedSearchTokens := make([]bool, len(searchTokens)) + matchedIndexTokens := make([]bool, len(indexedTokens)) + matchedIndexTokensLength := 0 + totalWeightedScores := 0.0 + for _, score := range scores { + //If neither the search token nor index token have been matched so far + if matchedSearchTokens[score.searchTokenIdx] == false && matchedIndexTokens[score.indexTokenIdx] == false { + //Weight the importance of this word score by its character length + searchToken := searchTokens[score.searchTokenIdx] + indexToken := indexedTokens[score.indexTokenIdx] + totalWeightedScores += score.score * float64(len(searchToken)+len(indexToken)) + + matchedSearchTokens[score.searchTokenIdx] = true + matchedIndexTokens[score.indexTokenIdx] = true + matchedIndexTokensLength += len(indexToken) + } + } + lengthWeightedAverageScore := totalWeightedScores / float64(searchTokensLength+matchedIndexTokensLength) + + //If some index tokens weren't matched by any search token, penalise this search a small amount. If this isn't done, + //a query of "John Doe" will match "John Doe" and "John Bartholomew Doe" equally well. 
+ //Calculate the fraction of the index name that wasn't matched, apply a weighting to reduce the importance of + //unmatched portion, then scale down the final score. + matchedIndexLength := 0 + for i, str := range indexedTokens { + if matchedIndexTokens[i] == true { + matchedIndexLength += len(str) + } + } + matchedFraction := float64(matchedIndexLength) / float64(indexTokensLength) + return lengthWeightedAverageScore * scalingFactor(matchedFraction, unmatchedIndexPenaltyWeight) +} + +func customJaroWinkler(s1 string, s2 string) float64 { + score := smetrics.JaroWinkler(s1, s2, boostThreshold, prefixSize) + + if lengthMetric := lengthDifferenceFactor(s1, s2); lengthMetric < lengthDifferenceCutoffFactor { + //If there's a big difference in matched token lengths, punish the score. Jaro-Winkler is quite permissive about + //different lengths + score = score * scalingFactor(lengthMetric, lengthDifferencePenaltyWeight) + } + if s1[0] != s2[0] { + //Penalise words that start with a different characters. Jaro-Winkler is too lenient on this + //TODO should use a phonetic comparison here, like Soundex + score = score * differentLetterPenaltyWeight + } + return score +} + +// scalingFactor returns a float [0,1] that can be used to scale another number down, given some metric and a desired +// weight +// e.g. 
If a score has a 50% value according to a metric, and we want a 10% weight to the metric: +// +// scaleFactor := scalingFactor(0.5, 0.1) // 0.95 +// scaledScore := score * scaleFactor +func scalingFactor(metric float64, weight float64) float64 { + return 1.0 - (1.0-metric)*weight +} + +func sumLength(strs []string) int { + totalLength := 0 + for _, str := range strs { + totalLength += len(str) + } + return totalLength +} + +func lengthDifferenceFactor(s1 string, s2 string) float64 { + ls1 := float64(len(s1)) + ls2 := float64(len(s2)) + min := math.Min(ls1, ls2) + max := math.Max(ls1, ls2) + return min / max +} + func (s *searcher) FindSDN(entityID string) *ofac.SDN { if sdn := s.debugSDN(entityID); sdn != nil { return sdn.SDN @@ -365,6 +474,7 @@ func (s *searcher) FindSDNsByRemarksID(limit int, id string) []*SDN { func (s *searcher) TopSDNs(limit int, minMatch float64, name string, keepSDN func(*SDN) bool) []*SDN { name = precompute(name) + nameTokens := strings.Fields(name) s.RLock() defer s.RUnlock() @@ -389,7 +499,7 @@ func (s *searcher) TopSDNs(limit int, minMatch float64, name string, keepSDN fun xs.add(&item{ matched: s.SDNs[i].name, value: s.SDNs[i], - weight: jaroWinkler(s.SDNs[i].name, name), + weight: bestPairsJaroWinkler(nameTokens, s.SDNs[i].name), }) }(i) } @@ -413,6 +523,7 @@ func (s *searcher) TopSDNs(limit int, minMatch float64, name string, keepSDN fun func (s *searcher) TopDPs(limit int, minMatch float64, name string) []DP { name = precompute(name) + nameTokens := strings.Fields(name) s.RLock() defer s.RUnlock() @@ -433,7 +544,7 @@ func (s *searcher) TopDPs(limit int, minMatch float64, name string) []DP { xs.add(&item{ matched: s.DPs[i].name, value: s.DPs[i], - weight: jaroWinkler(s.DPs[i].name, name), + weight: bestPairsJaroWinkler(nameTokens, s.DPs[i].name), }) }(i) } @@ -636,11 +747,16 @@ func precomputeDPs(persons []*dpl.DPL, pipe *pipeliner) []*DP { var ( // Jaro-Winkler parameters - boostThreshold = 
readFloat(os.Getenv("JARO_WINKLER_BOOST_THRESHOLD"), 0.7) + boostThreshold = readFloat(os.Getenv("JARO_WINKLER_BOOST_THRESHOLD"), 0.81) prefixSize = readInt(os.Getenv("JARO_WINKLER_PREFIX_SIZE"), 4) + //cCustomised Jaro-Winkler parameters + lengthDifferenceCutoffFactor = readFloat(os.Getenv("LENGTH_DIFFERENCE_CUTOFF_FACTOR"), 0.9) + lengthDifferencePenaltyWeight = readFloat(os.Getenv("LENGTH_DIFFERENCE_PENALTY_WEIGHT"), 0.4) + differentLetterPenaltyWeight = readFloat(os.Getenv("DIFFERENT_LETTER_PENALTY_WEIGHT"), 0.9) // Watchman parameters - exactMatchFavoritism = readFloat(os.Getenv("EXACT_MATCH_FAVORITISM"), 0.0) + exactMatchFavoritism = readFloat(os.Getenv("EXACT_MATCH_FAVORITISM"), 0.0) + unmatchedIndexPenaltyWeight = readFloat(os.Getenv("UNMATCHED_INDEX_TOKEN_WEIGHT"), 0.15) ) func readFloat(override string, value float64) float64 { @@ -679,49 +795,50 @@ var ( adjacentSimilarityPositions = readInt(os.Getenv("ADJACENT_SIMILARITY_POSITIONS"), 3) ) -func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 { - maxMatch := func(word string, s1Idx int, parts []string) (float64, string) { - if word == "" || len(parts) == 0 { +func jaroWinklerWithFavoritism(indexedTerm, query string, favoritism float64) float64 { + maxMatch := func(indexedWord string, indexedWordIdx int, queryWords []string) (float64, string) { + if indexedWord == "" || len(queryWords) == 0 { return 0.0, "" } // We're only looking for the highest match close - start := s1Idx - adjacentSimilarityPositions - end := s1Idx + adjacentSimilarityPositions + start := indexedWordIdx - adjacentSimilarityPositions + end := indexedWordIdx + adjacentSimilarityPositions var max float64 var maxTerm string for i := start; i < end; i++ { - if i >= 0 && len(parts) > i { - score := smetrics.JaroWinkler(word, parts[i], boostThreshold, prefixSize) + if i >= 0 && len(queryWords) > i { + score := smetrics.JaroWinkler(indexedWord, queryWords[i], boostThreshold, prefixSize) if score > max { max = score - 
maxTerm = parts[i] + maxTerm = queryWords[i] } } } return max, maxTerm } - s1Parts, s2Parts := strings.Fields(s1), strings.Fields(s2) - if len(s1Parts) == 0 || len(s2Parts) == 0 { + indexedWords, queryWords := strings.Fields(indexedTerm), strings.Fields(query) + if len(indexedWords) == 0 || len(queryWords) == 0 { return 0.0 // avoid returning NaN later on } var scores []float64 - for i := range s1Parts { - max, term := maxMatch(s1Parts[i], i, s2Parts) + for i := range indexedWords { + max, term := maxMatch(indexedWords[i], i, queryWords) + //fmt.Printf("%s maxMatch %s %f\n", indexedWords[i], term, max) if max >= 1.0 { - // If the query is longer than our indexed term (and both are longer than most names) + // If the query is longer than our indexed term (and EITHER are longer than most names) // we want to reduce the maximum weight proportionally by the term difference, which // forces more terms to match instead of one or two dominating the weight. - if (len(s2Parts) > len(s1Parts)) && (len(s1Parts) > 3 || len(s2Parts) > 3) { - max *= (float64(len(s1Parts)) / float64(len(s2Parts))) + if (len(queryWords) > len(indexedWords)) && (len(indexedWords) > 3 || len(queryWords) > 3) { + max *= (float64(len(indexedWords)) / float64(len(queryWords))) goto add } // If the indexed term is really short cap the match at 90%. // This sill allows names to match highly with a couple different characters. - if len(s1Parts) < 2 && len(s2Parts) > 1 { + if len(indexedWords) == 1 && len(queryWords) > 1 { max *= 0.9 goto add } @@ -734,14 +851,14 @@ func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 { // adjust the max lower by the proportion of different terms. // // We do this to decrease the importance of a short (often common) term. 
- if len(s2Parts) > len(s1Parts) { - scores = append(scores, max*float64(len(s1Parts))/float64(len(s2Parts))) + if len(queryWords) > len(indexedWords) { + scores = append(scores, max*float64(len(indexedWords))/float64(len(queryWords))) continue } // Apply an additional weight based on similarity of term lengths, // so terms which are closer in length match higher. - s1 := float64(len(s1Parts[i])) + s1 := float64(len(indexedWords[i])) t := float64(len(term)) - 1 weight := math.Min(math.Abs(s1/t), 1.0) @@ -749,11 +866,11 @@ func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 { } } - // average the highest N scores where N is the words in our query (s2). + // average the highest N scores where N is the words in our query (query). // Only truncate scores if there are enough words (aka more than First/Last). sort.Float64s(scores) - if len(s1Parts) > len(s2Parts) && len(s2Parts) > 5 { - scores = scores[len(s1Parts)-len(s2Parts):] + if len(indexedWords) > len(queryWords) && len(queryWords) > 5 { + scores = scores[len(indexedWords)-len(queryWords):] } var sum float64 diff --git a/cmd/server/search_generic.go b/cmd/server/search_generic.go index 8a1d2e5fd..df977fb1d 100644 --- a/cmd/server/search_generic.go +++ b/cmd/server/search_generic.go @@ -7,6 +7,7 @@ package main import ( "encoding/json" "reflect" + "strings" "sync" ) @@ -52,6 +53,8 @@ func topResults[T any](limit int, minMatch float64, name string, data []*Result[ } name = precompute(name) + nameTokens := strings.Fields(name) + xs := newLargest(limit, minMatch) var wg sync.WaitGroup @@ -64,7 +67,7 @@ func topResults[T any](limit int, minMatch float64, name string, data []*Result[ it := &item{ matched: data[i].precomputedName, value: data[i], - weight: jaroWinkler(data[i].precomputedName, name), + weight: bestPairsJaroWinkler(nameTokens, data[i].precomputedName), } for _, alt := range data[i].precomputedAlts { @@ -72,7 +75,7 @@ func topResults[T any](limit int, minMatch float64, name string, 
data []*Result[ continue } - score := jaroWinkler(alt, name) + score := bestPairsJaroWinkler(nameTokens, alt) if score > it.weight { it.matched = alt it.weight = score From df9ce902ee9adfce3195cabc99a18de877b166c6 Mon Sep 17 00:00:00 2001 From: Tom Daffurn Date: Fri, 8 Dec 2023 14:08:37 +1000 Subject: [PATCH 2/7] Add tests and fix tests broken by changing scores --- cmd/server/issue115_test.go | 6 +- cmd/server/new_algorithm_test.go | 100 +++++++++++++++++++++++++++++ cmd/server/search.go | 6 +- cmd/server/search_eu_csl_test.go | 2 +- cmd/server/search_handlers_test.go | 4 +- cmd/server/search_test.go | 8 +-- cmd/server/search_us_csl_test.go | 8 +-- 7 files changed, 117 insertions(+), 17 deletions(-) create mode 100644 cmd/server/new_algorithm_test.go diff --git a/cmd/server/issue115_test.go b/cmd/server/issue115_test.go index f21b944d1..fb53e158d 100644 --- a/cmd/server/issue115_test.go +++ b/cmd/server/issue115_test.go @@ -29,13 +29,13 @@ func TestIssue115__TopSDNs(t *testing.T) { s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "2680", SDNName: "HABBASH, George", SDNType: "INDIVIDUAL"}}, nil, pipe) out := s.TopSDNs(1, 0.00, "george bush", keeper) - eql(t, "issue115: top SDN 2680", out[0].match, 0.732) + eql(t, "issue115: top SDN 2680", out[0].match, 0.687) // was 88.3% match s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "9432", SDNName: "CHIWESHE, George", SDNType: "INDIVIDUAL"}}, nil, pipe) out = s.TopSDNs(1, 0.00, "george bush", keeper) - eql(t, "issue115: top SDN 18996", out[0].match, 0.764) + eql(t, "issue115: top SDN 18996", out[0].match, 0.650) // another example s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "0", SDNName: "Bush, George W", SDNType: "INDIVIDUAL"}}, nil, pipe) @@ -47,5 +47,5 @@ func TestIssue115__TopSDNs(t *testing.T) { eql(t, "issue115: top SDN 0", out[0].match, 1.0) out = s.TopSDNs(1, 0.00, "george bush", keeper) - eql(t, "issue115: top SDN 0", out[0].match, 0.667) + eql(t, "issue115: top SDN 0", out[0].match, 0.986) } diff --git 
a/cmd/server/new_algorithm_test.go b/cmd/server/new_algorithm_test.go new file mode 100644 index 000000000..b3945ade1 --- /dev/null +++ b/cmd/server/new_algorithm_test.go @@ -0,0 +1,100 @@ +// Copyright 2022 The Moov Authors +// Use of this source code is governed by an Apache License +// license that can be found in the LICENSE file. + +package main + +import ( + "strings" + "testing" +) + +func TestBestPairsJaroWinkler__FalsePositives(t *testing.T) { + // Words in the query should be matched against at most one indexed word. Doubled names on the sanctioned list can + // skew results + // 1. SDN Entity 40273, VLADIMIROV, Vladimir Vladimirovich + query := "vladimir levenshtein" + indexedName := "vladimirov vladimir vladimirovich" + score1 := jaroWinkler(indexedName, query) + score2 := bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too high: "+query, score1, 0.961) + eql(t, "New score is better: "+query, score2, 0.603) + + // 2. SDN Entity 7788 "SHAQIRI, Shaqir" + query = "zaid shakir" + indexedName = "shaqiri shaqir" + score1 = jaroWinkler(indexedName, query) + score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too high: "+query, score1, 0.908) + eql(t, "New score is better: "+query, score2, 0.704) + + // Single-word sanctioned names shouldn't match any query with that name part + // 1. SDN Entity 15050 "HADI" + query = "hadi alwai" + indexedName = "hadi" + score1 = jaroWinkler(indexedName, query) + score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too high: "+query, score1, 0.900) + eql(t, "New score is better: "+query, score2, 0.615) + + // Name-part scores should be weighted by the character length. If not, small words can have unfair weight + // 1. 
SDN Entity "LI, Shangfu" + query = "li shanlan" + indexedName = "li shangfu" + score1 = jaroWinkler(indexedName, query) + score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too high: "+query, score1, 0.914) + eql(t, "New score is better: "+query, score2, 0.867) + + // Words with different lengths shouldn't match very highly + query = "brown" + indexedName = "browningweight" + score1 = jaroWinkler(indexedName, query) + score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too high: "+query, score1, 0.871) + eql(t, "New score is better: "+query, score2, 0.703) + + // Words that start with different letters shouldn't match very highly + query = "jimenez" + indexedName = "dominguez" + score1 = jaroWinkler(indexedName, query) + score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too high: "+query, score1, 0.690) + eql(t, "New score is better: "+query, score2, 0.580) +} + +func TestBestPairsJaroWinkler__TruePositives(t *testing.T) { + // Unmatched indexed words had a large weight, causing false negatives for missing "middle names" + // 1. Saddam Hussein + query := "saddam hussien" + indexedName := "saddam hussein al tikriti" + score1 := jaroWinkler(indexedName, query) + score2 := bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too low: "+query, score1, 0.656) + eql(t, "New score is better: "+query, score2, 0.924) + + // 2. SDN Entity 7574 "VALENCIA TRUJILLO, Joaquin Mario" + query = "valencia trujillo joaquin" + indexedName = "valencia trujillo joaquin mario" + score1 = jaroWinkler(indexedName, query) + score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too low: "+query, score1, 0.868) + eql(t, "New score is better: "+query, score2, 0.973) + + // 3. 
SDN Entity 9760 "LUKASHENKO, Alexander Grigoryevich" + query = "alexander lukashenko" + indexedName = "lukashenko alexander grigoryevich" + score1 = jaroWinkler(indexedName, query) + score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too low: "+query, score1, 0.765) + eql(t, "New score is better: "+query, score2, 0.942) + + // Small words had too much weight, causing false negatives + // 1. SDN Entity 4691 "A.I.C. SOGO KENKYUSHO" + query = "sogo kenkyusho" + indexedName = "a i c sogo kenkyusho" + score1 = jaroWinkler(indexedName, query) + score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too low: "+query, score1, 0.400) + eql(t, "New score is better: "+query, score2, 0.972) +} diff --git a/cmd/server/search.go b/cmd/server/search.go index e65a826e9..17002d4b6 100644 --- a/cmd/server/search.go +++ b/cmd/server/search.go @@ -747,11 +747,11 @@ func precomputeDPs(persons []*dpl.DPL, pipe *pipeliner) []*DP { var ( // Jaro-Winkler parameters - boostThreshold = readFloat(os.Getenv("JARO_WINKLER_BOOST_THRESHOLD"), 0.81) + boostThreshold = readFloat(os.Getenv("JARO_WINKLER_BOOST_THRESHOLD"), 0.7) prefixSize = readInt(os.Getenv("JARO_WINKLER_PREFIX_SIZE"), 4) - //cCustomised Jaro-Winkler parameters + // Customised Jaro-Winkler parameters lengthDifferenceCutoffFactor = readFloat(os.Getenv("LENGTH_DIFFERENCE_CUTOFF_FACTOR"), 0.9) - lengthDifferencePenaltyWeight = readFloat(os.Getenv("LENGTH_DIFFERENCE_PENALTY_WEIGHT"), 0.4) + lengthDifferencePenaltyWeight = readFloat(os.Getenv("LENGTH_DIFFERENCE_PENALTY_WEIGHT"), 0.3) differentLetterPenaltyWeight = readFloat(os.Getenv("DIFFERENT_LETTER_PENALTY_WEIGHT"), 0.9) // Watchman parameters diff --git a/cmd/server/search_eu_csl_test.go b/cmd/server/search_eu_csl_test.go index 22422356e..7d192c218 100644 --- a/cmd/server/search_eu_csl_test.go +++ b/cmd/server/search_eu_csl_test.go @@ -28,7 +28,7 @@ func TestSearch__EU_CSL(t *testing.T) { w.Flush() require.Equal(t, 
http.StatusOK, w.Code) - require.Contains(t, w.Body.String(), `"match":0.65555`) + require.Contains(t, w.Body.String(), `"match":0.92419`) require.Contains(t, w.Body.String(), `"matchedName":"saddam hussein al tikriti"`) var wrapper struct { diff --git a/cmd/server/search_handlers_test.go b/cmd/server/search_handlers_test.go index 05973fa70..db9ee0af5 100644 --- a/cmd/server/search_handlers_test.go +++ b/cmd/server/search_handlers_test.go @@ -269,7 +269,7 @@ func TestSearch__Name(t *testing.T) { w.Flush() require.Equal(t, http.StatusOK, w.Code) - require.Contains(t, w.Body.String(), `"match":0.89166`) + require.Contains(t, w.Body.String(), `"match":0.95588`) require.Contains(t, w.Body.String(), `"matchedName":"dr ayman al zawahiri"`) var wrapper struct { @@ -319,7 +319,7 @@ func TestSearch__AltName(t *testing.T) { } require.Equal(t, http.StatusOK, w.Code) - require.Contains(t, w.Body.String(), `"match":0.5`) + require.Contains(t, w.Body.String(), `"match":0.98`) require.Contains(t, w.Body.String(), `"matchedName":"i c sogo kenkyusho"`) var wrapper struct { diff --git a/cmd/server/search_test.go b/cmd/server/search_test.go index 913ce46a4..90ea38ad6 100644 --- a/cmd/server/search_test.go +++ b/cmd/server/search_test.go @@ -581,9 +581,9 @@ func TestSearch_liveData(t *testing.T) { name string match float64 // top match % }{ - {"Nicolas MADURO", 0.910}, - {"nicolas maduro", 0.910}, - {"NICOLAS maduro", 0.910}, + {"Nicolas MADURO", 0.958}, + {"nicolas maduro", 0.958}, + {"NICOLAS maduro", 0.958}, } keeper := keepSDN(filterRequest{}) @@ -753,7 +753,7 @@ func TestSearch__TopSDNs(t *testing.T) { if len(sdns) == 0 { t.Fatal("empty SDNs") } - require.Equal(t, "2681", sdns[0].EntityID) + require.Equal(t, "2676", sdns[0].EntityID) } func TestSearch__TopDPs(t *testing.T) { diff --git a/cmd/server/search_us_csl_test.go b/cmd/server/search_us_csl_test.go index 3f04f600a..f05a878a8 100644 --- a/cmd/server/search_us_csl_test.go +++ b/cmd/server/search_us_csl_test.go @@ -30,8 +30,8 
@@ func TestSearch_US_CSL(t *testing.T) { w.Flush() require.Equal(t, http.StatusOK, w.Code) - require.Contains(t, w.Body.String(), `"match":0.6333`) - require.Contains(t, w.Body.String(), `"matchedName":"zaman"`) + require.Contains(t, w.Body.String(), `"match":0.89`) + require.Contains(t, w.Body.String(), `"matchedName":"abdul qadeer khan"`) var wrapper struct { NonProliferationSanctions []csl.ISN `json:"nonProliferationSanctions"` @@ -76,7 +76,7 @@ func TestSearcher_TopMEUs(t *testing.T) { require.Len(t, meus, 1) require.Equal(t, "d54346ef81802673c1b1daeb2ca8bd5d13755abd", meus[0].Data.EntityID) - require.Equal(t, "0.70597", fmt.Sprintf("%.5f", meus[0].match)) + require.Equal(t, "0.88750", fmt.Sprintf("%.5f", meus[0].match)) } func TestSearcher_TopSSIs(t *testing.T) { @@ -120,7 +120,7 @@ func TestSearcher_TopISNs(t *testing.T) { isn := isns[0] require.Equal(t, "2d2db09c686e4829d0ef1b0b04145eec3d42cd88", isn.Data.EntityID) - require.Equal(t, "0.92", fmt.Sprintf("%.2f", isn.match)) + require.Equal(t, "0.93", fmt.Sprintf("%.2f", isn.match)) } func TestSearcher_TopUVLs(t *testing.T) { From 8ddbe1629c817a4e01bf3c5a004a82c971158802 Mon Sep 17 00:00:00 2001 From: Tom Daffurn Date: Fri, 8 Dec 2023 15:28:39 +1000 Subject: [PATCH 3/7] Update `TestJaroWinkler` to use best-pairs algo --- cmd/server/search_test.go | 124 +++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/cmd/server/search_test.go b/cmd/server/search_test.go index 90ea38ad6..60eaa70ac 100644 --- a/cmd/server/search_test.go +++ b/cmd/server/search_test.go @@ -429,110 +429,110 @@ func verifyDownloadStats(b *testing.B) { func TestJaroWinkler(t *testing.T) { cases := []struct { - s1, s2 string - match float64 + indexed, search string + match float64 }{ // examples - {"wei, zhao", "wei, Zhao", 0.917}, + {"wei, zhao", "wei, Zhao", 0.875}, {"WEI, Zhao", "WEI, Zhao", 1.0}, {"WEI Zhao", "WEI Zhao", 1.0}, {strings.ToLower("WEI Zhao"), precompute("WEI, Zhao"), 1.0}, // 
apply jaroWinkler in both directions - {"jane doe", "jan lahore", 0.621}, - {"jan lahore", "jane doe", 0.776}, + {"jane doe", "jan lahore", 0.596}, + {"jan lahore", "jane doe", 0.596}, // real world case - {"john doe", "paul john", 0.764}, - {"john doe", "john othername", 0.618}, + {"john doe", "paul john", 0.533}, + {"john doe", "john othername", 0.672}, // close match - {"jane doe", "jane doe2", 0.971}, + {"jane doe", "jane doe2", 0.940}, // real-ish world examples - {"kalamity linden", "kala limited", 0.771}, - {"kala limited", "kalamity linden", 0.602}, + {"kalamity linden", "kala limited", 0.687}, + {"kala limited", "kalamity linden", 0.687}, // examples used in demos / commonly {"nicolas", "nicolas", 1.0}, - {"nicolas moros maduro", "nicolas maduro", 0.91}, - {"nicolas maduro", "nicolas moros maduro", 1.0}, + {"nicolas moros maduro", "nicolas maduro", 0.958}, + {"nicolas maduro", "nicolas moros maduro", 0.839}, // customer examples - {"ian", "ian mckinley", 0.9}, - {"iap", "ian mckinley", 0.411}, - {"ian mckinley", "ian", 0.819}, - {"ian mckinley", "iap", 0.654}, - {"ian mckinley", "tian xiang 7", 0.5}, - {"bindaree food group pty", precompute("independent insurance group ltd"), 0.659}, // precompute removes ltd - {"bindaree food group pty ltd", "independent insurance group ltd", 0.728}, // only matches higher from 'ltd' - {"p.c.c. 
(singapore) private limited", "culver max entertainment private limited", 0.602}, - {"zincum llc", "easy verification inc.", 0.426}, - {"transpetrochart co ltd", "jx metals trading co.", 0.544}, - {"technolab", "moomoo technologies inc", 0.291}, - {"sewa security services", "sesa - safety & environmental services australia pty ltd", 0.247}, - {"bueno", "20/f rykadan capital twr135 hoi bun rd, kwun tong 135 hoi bun rd., kwun tong", 0.0}, + {"ian", "ian mckinley", 0.429}, + {"iap", "ian mckinley", 0.352}, + {"ian mckinley", "ian", 0.891}, + {"ian mckinley", "iap", 0.733}, + {"ian mckinley", "tian xiang 7", 0.526}, + {"bindaree food group pty", precompute("independent insurance group ltd"), 0.576}, // precompute removes ltd + {"bindaree food group pty ltd", "independent insurance group ltd", 0.631}, // only matches higher from 'ltd' + {"p.c.c. (singapore) private limited", "culver max entertainment private limited", 0.658}, + {"zincum llc", "easy verification inc.", 0.380}, + {"transpetrochart co ltd", "jx metals trading co.", 0.496}, + {"technolab", "moomoo technologies inc", 0.565}, + {"sewa security services", "sesa - safety & environmental services australia pty ltd", 0.480}, + {"bueno", "20/f rykadan capital twr135 hoi bun rd, kwun tong 135 hoi bun rd., kwun tong", 0.094}, // example cases - {"nicolas maduro", "nicolás maduro", 0.961}, + {"nicolas maduro", "nicolás maduro", 0.937}, {"nicolas maduro", precompute("nicolás maduro"), 1.0}, - {"nic maduro", "nicolas maduro", 0.717}, - {"nick maduro", "nicolas maduro", 0.769}, - {"nicolas maduroo", "nicolas maduro", 0.986}, + {"nic maduro", "nicolas maduro", 0.872}, + {"nick maduro", "nicolas maduro", 0.859}, + {"nicolas maduroo", "nicolas maduro", 0.966}, {"nicolas maduro", "nicolas maduro", 1.0}, {"maduro, nicolas", "maduro, nicolas", 1.0}, {"maduro moros, nicolas", "maduro moros, nicolas", 1.0}, - {"maduro moros, nicolas", "nicolas maduro", 0.889}, - {"nicolas maduro moros", "maduro", 0.722}, - {"nicolas maduro 
moros", "nicolás maduro", 0.884}, - {"nicolas, maduro moros", "maduro", 0.720}, - {"nicolas, maduro moros", "nicolas maduro", 0.902}, - {"nicolas, maduro moros", "nicolás", 0.554}, - {"nicolas, maduro moros", "maduro", 0.720}, - {"nicolas, maduro moros", "nicolás maduro", 0.877}, - {"africada financial services bureau change", "skylight", 0.266}, - {"africada financial services bureau change", "skylight financial inc", 0.596}, - {"africada financial services bureau change", "skylight services inc", 0.645}, - {"africada financial services bureau change", "skylight financial services", 0.67}, - {"africada financial services bureau change", "skylight financial services inc", 0.696}, + {"maduro moros, nicolas", "nicolas maduro", 0.953}, + {"nicolas maduro moros", "maduro", 0.900}, + {"nicolas maduro moros", "nicolás maduro", 0.898}, + {"nicolas, maduro moros", "maduro", 0.897}, + {"nicolas, maduro moros", "nicolas maduro", 0.928}, + {"nicolas, maduro moros", "nicolás", 0.822}, + {"nicolas, maduro moros", "maduro", 0.897}, + {"nicolas, maduro moros", "nicolás maduro", 0.906}, + {"africada financial services bureau change", "skylight", 0.441}, + {"africada financial services bureau change", "skylight financial inc", 0.658}, + {"africada financial services bureau change", "skylight services inc", 0.621}, + {"africada financial services bureau change", "skylight financial services", 0.761}, + {"africada financial services bureau change", "skylight financial services inc", 0.730}, // stopwords tests - {"the group for the preservation of the holy sites", "the bridgespan group", 0.448}, - {precompute("the group for the preservation of the holy sites"), precompute("the bridgespan group"), 0.448}, - {"group preservation holy sites", "bridgespan group", 0.619}, + {"the group for the preservation of the holy sites", "the bridgespan group", 0.682}, + {precompute("the group for the preservation of the holy sites"), precompute("the bridgespan group"), 0.682}, + {"group preservation 
holy sites", "bridgespan group", 0.652}, - {"the group for the preservation of the holy sites", "the logan group", 0.424}, - {precompute("the group for the preservation of the holy sites"), precompute("the logan group"), 0.424}, - {"group preservation holy sites", "logan group", 0.478}, + {"the group for the preservation of the holy sites", "the logan group", 0.730}, + {precompute("the group for the preservation of the holy sites"), precompute("the logan group"), 0.730}, + {"group preservation holy sites", "logan group", 0.649}, - {"the group for the preservation of the holy sites", "the anything group", 0.437}, - {precompute("the group for the preservation of the holy sites"), precompute("the anything group"), 0.437}, + {"the group for the preservation of the holy sites", "the anything group", 0.698}, + {precompute("the group for the preservation of the holy sites"), precompute("the anything group"), 0.698}, {"group preservation holy sites", "anything group", 0.585}, - {"the group for the preservation of the holy sites", "the hello world group", 0.47}, - {precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 0.47}, - {"group preservation holy sites", "hello world group", 0.515}, + {"the group for the preservation of the holy sites", "the hello world group", 0.706}, + {precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 0.706}, + {"group preservation holy sites", "hello world group", 0.560}, - {"the group for the preservation of the holy sites", "the group", 0.416}, - {precompute("the group for the preservation of the holy sites"), precompute("the group"), 0.416}, - {"group preservation holy sites", "group", 0.460}, + {"the group for the preservation of the holy sites", "the group", 0.880}, + {precompute("the group for the preservation of the holy sites"), precompute("the group"), 0.880}, + {"group preservation holy sites", "group", 0.879}, - {"the group for the 
preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.403}, + {"the group for the preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.426}, { precompute("the group for the preservation of the holy sites"), precompute("the flibbity jibbity flobbity jobbity grobbity zobbity group"), - 0.459, + 0.446, }, - {"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.239}, + {"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.334}, // precompute - {"i c sogo kenkyusho", precompute("A.I.C. SOGO KENKYUSHO"), 0.5}, - {precompute("A.I.C. SOGO KENKYUSHO"), "sogo kenkyusho", 0.667}, + {"i c sogo kenkyusho", precompute("A.I.C. SOGO KENKYUSHO"), 0.858}, + {precompute("A.I.C. SOGO KENKYUSHO"), "sogo kenkyusho", 0.972}, } for i := range cases { v := cases[i] // Only need to call chomp on s1, see jaroWinkler doc - eql(t, fmt.Sprintf("#%d %s vs %s", i, v.s1, v.s2), jaroWinkler(v.s1, v.s2), v.match) + eql(t, fmt.Sprintf("#%d %s vs %s", i, v.indexed, v.search), bestPairsJaroWinkler(strings.Fields(v.search), v.indexed), v.match) } } From 19bae9299933f3c611e22121008c6b354a95b5d9 Mon Sep 17 00:00:00 2001 From: Tom Daffurn Date: Mon, 11 Dec 2023 10:44:19 +1000 Subject: [PATCH 4/7] Clean up test --- cmd/server/new_algorithm_test.go | 96 ++++++++++++-------------------- 1 file changed, 36 insertions(+), 60 deletions(-) diff --git a/cmd/server/new_algorithm_test.go b/cmd/server/new_algorithm_test.go index b3945ade1..300672285 100644 --- a/cmd/server/new_algorithm_test.go +++ b/cmd/server/new_algorithm_test.go @@ -13,88 +13,64 @@ func TestBestPairsJaroWinkler__FalsePositives(t *testing.T) { // Words in the query should be matched against at most one indexed word. Doubled names on the sanctioned list can // skew results // 1. 
SDN Entity 40273, VLADIMIROV, Vladimir Vladimirovich - query := "vladimir levenshtein" - indexedName := "vladimirov vladimir vladimirovich" - score1 := jaroWinkler(indexedName, query) - score2 := bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too high: "+query, score1, 0.961) - eql(t, "New score is better: "+query, score2, 0.603) + oldScore, newScore := compareAlgorithms("vladimirov vladimir vladimirovich", "vladimir levenshtein") + eql(t, "Score is too high", oldScore, 0.961) + eql(t, "New score is better", newScore, 0.603) // 2. SDN Entity 7788 "SHAQIRI, Shaqir" - query = "zaid shakir" - indexedName = "shaqiri shaqir" - score1 = jaroWinkler(indexedName, query) - score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too high: "+query, score1, 0.908) - eql(t, "New score is better: "+query, score2, 0.704) + oldScore, newScore = compareAlgorithms("shaqiri shaqir", "zaid shakir") + eql(t, "Score is too high", oldScore, 0.908) + eql(t, "New score is better", newScore, 0.704) // Single-word sanctioned names shouldn't match any query with that name part // 1. SDN Entity 15050 "HADI" - query = "hadi alwai" - indexedName = "hadi" - score1 = jaroWinkler(indexedName, query) - score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too high: "+query, score1, 0.900) - eql(t, "New score is better: "+query, score2, 0.615) + oldScore, newScore = compareAlgorithms("hadi", "hadi alwai") + eql(t, "Score is too high", oldScore, 0.900) + eql(t, "New score is better", newScore, 0.615) // Name-part scores should be weighted by the character length. If not, small words can have unfair weight // 1. 
SDN Entity "LI, Shangfu" - query = "li shanlan" - indexedName = "li shangfu" - score1 = jaroWinkler(indexedName, query) - score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too high: "+query, score1, 0.914) - eql(t, "New score is better: "+query, score2, 0.867) + oldScore, newScore = compareAlgorithms("li shangfu", "li shanlan") + eql(t, "Score is too high", oldScore, 0.914) + eql(t, "New score is better", newScore, 0.867) // Words with different lengths shouldn't match very highly - query = "brown" - indexedName = "browningweight" - score1 = jaroWinkler(indexedName, query) - score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too high: "+query, score1, 0.871) - eql(t, "New score is better: "+query, score2, 0.703) + oldScore, newScore = compareAlgorithms("browningweight", "brown") + eql(t, "Score is too high", oldScore, 0.871) + eql(t, "New score is better", newScore, 0.703) // Words that start with different letters shouldn't match very highly - query = "jimenez" - indexedName = "dominguez" - score1 = jaroWinkler(indexedName, query) - score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too high: "+query, score1, 0.690) - eql(t, "New score is better: "+query, score2, 0.580) + oldScore, newScore = compareAlgorithms("dominguez", "jimenez") + eql(t, "Score is too high", oldScore, 0.690) + eql(t, "New score is better", newScore, 0.580) } func TestBestPairsJaroWinkler__TruePositives(t *testing.T) { // Unmatched indexed words had a large weight, causing false negatives for missing "middle names" // 1. 
Saddam Hussein - query := "saddam hussien" - indexedName := "saddam hussein al tikriti" - score1 := jaroWinkler(indexedName, query) - score2 := bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too low: "+query, score1, 0.656) - eql(t, "New score is better: "+query, score2, 0.924) + oldScore, newScore := compareAlgorithms("saddam hussein al tikriti", "saddam hussien") + eql(t, "Score is too low", oldScore, 0.656) + eql(t, "New score is better", newScore, 0.924) // 2. SDN Entity 7574 "VALENCIA TRUJILLO, Joaquin Mario" - query = "valencia trujillo joaquin" - indexedName = "valencia trujillo joaquin mario" - score1 = jaroWinkler(indexedName, query) - score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too low: "+query, score1, 0.868) - eql(t, "New score is better: "+query, score2, 0.973) + oldScore, newScore = compareAlgorithms("valencia trujillo joaquin mario", "valencia trujillo joaquin") + eql(t, "Score is too low", oldScore, 0.868) + eql(t, "New score is better", newScore, 0.973) // 3. SDN Entity 9760 "LUKASHENKO, Alexander Grigoryevich" - query = "alexander lukashenko" - indexedName = "lukashenko alexander grigoryevich" - score1 = jaroWinkler(indexedName, query) - score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too low: "+query, score1, 0.765) - eql(t, "New score is better: "+query, score2, 0.942) + oldScore, newScore = compareAlgorithms("lukashenko alexander grigoryevich", "alexander lukashenko") + eql(t, "Score is too low", oldScore, 0.765) + eql(t, "New score is better", newScore, 0.942) // Small words had too much weight, causing false negatives // 1. SDN Entity 4691 "A.I.C. 
SOGO KENKYUSHO" - query = "sogo kenkyusho" - indexedName = "a i c sogo kenkyusho" - score1 = jaroWinkler(indexedName, query) - score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too low: "+query, score1, 0.400) - eql(t, "New score is better: "+query, score2, 0.972) + oldScore, newScore = compareAlgorithms("a i c sogo kenkyusho", "sogo kenkyusho") + eql(t, "Score is too low", oldScore, 0.400) + eql(t, "New score is better", newScore, 0.972) +} + +func compareAlgorithms(indexedName string, query string) (float64, float64) { + oldScore := jaroWinkler(indexedName, query) + newScore := bestPairsJaroWinkler(strings.Fields(query), indexedName) + return oldScore, newScore } From 336b96c9d4a6a74abfc2a7be3820bf87a475c8f6 Mon Sep 17 00:00:00 2001 From: Tom Daffurn Date: Wed, 13 Dec 2023 09:09:19 +1000 Subject: [PATCH 5/7] Fix linter issue --- cmd/server/search.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/server/search.go b/cmd/server/search.go index 17002d4b6..1260d9f36 100644 --- a/cmd/server/search.go +++ b/cmd/server/search.go @@ -330,7 +330,7 @@ func bestPairsJaroWinkler(searchTokens []string, indexed string) float64 { totalWeightedScores := 0.0 for _, score := range scores { //If neither the search token nor index token have been matched so far - if matchedSearchTokens[score.searchTokenIdx] == false && matchedIndexTokens[score.indexTokenIdx] == false { + if !matchedSearchTokens[score.searchTokenIdx] && !matchedIndexTokens[score.indexTokenIdx] { //Weight the importance of this word score by its character length searchToken := searchTokens[score.searchTokenIdx] indexToken := indexedTokens[score.indexTokenIdx] From b99f1e3523cc00e9330a78b2d1d4ea48afdfd92d Mon Sep 17 00:00:00 2001 From: Tom Daffurn Date: Wed, 13 Dec 2023 09:09:36 +1000 Subject: [PATCH 6/7] Update config settings --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2c7dd7968..80033f06a 
100644 --- a/README.md +++ b/README.md @@ -184,7 +184,7 @@ You should get this response: PONG ``` -### Configuration settings +### Configuration settings | Environmental Variable | Description | Default | |-----|-----|-----| @@ -192,6 +192,10 @@ PONG | `INITIAL_DATA_DIRECTORY` | Directory filepath with initial files to use instead of downloading. Periodic downloads will replace the initial files. | Empty | | `ADJACENT_SIMILARITY_POSITIONS` | How many nearby words to search for highest max similarly score. | 3 | | `EXACT_MATCH_FAVORITISM` | Extra weighting assigned to exact matches. | 0.0 | +| `LENGTH_DIFFERENCE_CUTOFF_FACTOR` | Minimum ratio for the length of two matching tokens, before their score is penalised. | 0.9 | +| `LENGTH_DIFFERENCE_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens have different lengths. | 0.3 | +| `DIFFERENT_LETTER_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens begin with different letters. | 0.9 | +| `UNMATCHED_INDEX_TOKEN_WEIGHT` | Weight of penalty applied to scores when part of the indexed name isn't matched. | 0.15 | | `JARO_WINKLER_BOOST_THRESHOLD` | Jaro-Winkler boost threshold. | 0.7 | | `JARO_WINKLER_PREFIX_SIZE` | Jaro-Winkler prefix size. | 4 | | `WEBHOOK_BATCH_SIZE` | How many watches to read from database per batch of async searches. | 100 | From ddb46e7f2f4d950c329730bec4c76d9e39329755 Mon Sep 17 00:00:00 2001 From: Tom Daffurn Date: Wed, 13 Dec 2023 09:23:44 +1000 Subject: [PATCH 7/7] More linter fixes --- cmd/server/search.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/server/search.go b/cmd/server/search.go index 1260d9f36..aa73c12c4 100644 --- a/cmd/server/search.go +++ b/cmd/server/search.go @@ -349,7 +349,7 @@ func bestPairsJaroWinkler(searchTokens []string, indexed string) float64 { //unmatched portion, then scale down the final score. 
matchedIndexLength := 0 for i, str := range indexedTokens { - if matchedIndexTokens[i] == true { + if matchedIndexTokens[i] { matchedIndexLength += len(str) } }