From e68b03dcfc9f8aafef62ae34b5a470c2aa268acb Mon Sep 17 00:00:00 2001 From: Tom Daffurn Date: Thu, 30 Nov 2023 18:21:11 +1000 Subject: [PATCH 1/7] Experimental search algo --- cmd/server/search.go | 171 +++++++++++++++++++++++++++++------ cmd/server/search_generic.go | 7 +- 2 files changed, 149 insertions(+), 29 deletions(-) diff --git a/cmd/server/search.go b/cmd/server/search.go index d5b2a73a1..e65a826e9 100644 --- a/cmd/server/search.go +++ b/cmd/server/search.go @@ -248,6 +248,7 @@ func (s *searcher) FindAlts(limit int, id string) []*ofac.AlternateIdentity { func (s *searcher) TopAltNames(limit int, minMatch float64, alt string) []Alt { alt = precompute(alt) + altTokens := strings.Fields(alt) s.RLock() defer s.RUnlock() @@ -268,7 +269,7 @@ func (s *searcher) TopAltNames(limit int, minMatch float64, alt string) []Alt { xs.add(&item{ matched: s.Alts[i].name, value: s.Alts[i], - weight: jaroWinkler(s.Alts[i].name, alt), + weight: bestPairsJaroWinkler(altTokens, s.Alts[i].name), }) }(i) } @@ -290,6 +291,114 @@ func (s *searcher) TopAltNames(limit int, minMatch float64, alt string) []Alt { return out } +// bestPairsJaroWinkler compares a search query to an indexed term (name, address, etc) and returns a decimal fraction +// score. +// +// The algorithm splits each string into tokens, and does a pairwise Jaro-Winkler score of all token combinations +// (outer product). The best match for each search token is chosen, such that each index token can be matched at most +// once. +// +// The pairwise scores are combined into an average in a way that corrects for character length, and the fraction of the +// indexed term that didn't match. 
+func bestPairsJaroWinkler(searchTokens []string, indexed string) float64 { + type Score struct { + score float64 + searchTokenIdx int + indexTokenIdx int + } + + indexedTokens := strings.Fields(indexed) + searchTokensLength := sumLength(searchTokens) + indexTokensLength := sumLength(indexedTokens) + + //Compare each search token to each indexed token. Sort the results in descending order + scores := make([]Score, 0) + for searchIdx, searchToken := range searchTokens { + for indexIdx, indexedToken := range indexedTokens { + score := customJaroWinkler(indexedToken, searchToken) + scores = append(scores, Score{score, searchIdx, indexIdx}) + } + } + sort.Slice(scores[:], func(i, j int) bool { + return scores[i].score > scores[j].score + }) + + //Pick the highest score for each search term, where the indexed token hasn't yet been matched + matchedSearchTokens := make([]bool, len(searchTokens)) + matchedIndexTokens := make([]bool, len(indexedTokens)) + matchedIndexTokensLength := 0 + totalWeightedScores := 0.0 + for _, score := range scores { + //If neither the search token nor index token have been matched so far + if matchedSearchTokens[score.searchTokenIdx] == false && matchedIndexTokens[score.indexTokenIdx] == false { + //Weight the importance of this word score by its character length + searchToken := searchTokens[score.searchTokenIdx] + indexToken := indexedTokens[score.indexTokenIdx] + totalWeightedScores += score.score * float64(len(searchToken)+len(indexToken)) + + matchedSearchTokens[score.searchTokenIdx] = true + matchedIndexTokens[score.indexTokenIdx] = true + matchedIndexTokensLength += len(indexToken) + } + } + lengthWeightedAverageScore := totalWeightedScores / float64(searchTokensLength+matchedIndexTokensLength) + + //If some index tokens weren't matched by any search token, penalise this search a small amount. If this isn't done, + //a query of "John Doe" will match "John Doe" and "John Bartholomew Doe" equally well. 
+ //Calculate the fraction of the index name that wasn't matched, apply a weighting to reduce the importance of + //unmatched portion, then scale down the final score. + matchedIndexLength := 0 + for i, str := range indexedTokens { + if matchedIndexTokens[i] == true { + matchedIndexLength += len(str) + } + } + matchedFraction := float64(matchedIndexLength) / float64(indexTokensLength) + return lengthWeightedAverageScore * scalingFactor(matchedFraction, unmatchedIndexPenaltyWeight) +} + +func customJaroWinkler(s1 string, s2 string) float64 { + score := smetrics.JaroWinkler(s1, s2, boostThreshold, prefixSize) + + if lengthMetric := lengthDifferenceFactor(s1, s2); lengthMetric < lengthDifferenceCutoffFactor { + //If there's a big difference in matched token lengths, punish the score. Jaro-Winkler is quite permissive about + //different lengths + score = score * scalingFactor(lengthMetric, lengthDifferencePenaltyWeight) + } + if s1[0] != s2[0] { + //Penalise words that start with a different characters. Jaro-Winkler is too lenient on this + //TODO should use a phonetic comparison here, like Soundex + score = score * differentLetterPenaltyWeight + } + return score +} + +// scalingFactor returns a float [0,1] that can be used to scale another number down, given some metric and a desired +// weight +// e.g. 
If a score has a 50% value according to a metric, and we want a 10% weight to the metric: +// +// scaleFactor := scalingFactor(0.5, 0.1) // 0.95 +// scaledScore := score * scaleFactor +func scalingFactor(metric float64, weight float64) float64 { + return 1.0 - (1.0-metric)*weight +} + +func sumLength(strs []string) int { + totalLength := 0 + for _, str := range strs { + totalLength += len(str) + } + return totalLength +} + +func lengthDifferenceFactor(s1 string, s2 string) float64 { + ls1 := float64(len(s1)) + ls2 := float64(len(s2)) + min := math.Min(ls1, ls2) + max := math.Max(ls1, ls2) + return min / max +} + func (s *searcher) FindSDN(entityID string) *ofac.SDN { if sdn := s.debugSDN(entityID); sdn != nil { return sdn.SDN @@ -365,6 +474,7 @@ func (s *searcher) FindSDNsByRemarksID(limit int, id string) []*SDN { func (s *searcher) TopSDNs(limit int, minMatch float64, name string, keepSDN func(*SDN) bool) []*SDN { name = precompute(name) + nameTokens := strings.Fields(name) s.RLock() defer s.RUnlock() @@ -389,7 +499,7 @@ func (s *searcher) TopSDNs(limit int, minMatch float64, name string, keepSDN fun xs.add(&item{ matched: s.SDNs[i].name, value: s.SDNs[i], - weight: jaroWinkler(s.SDNs[i].name, name), + weight: bestPairsJaroWinkler(nameTokens, s.SDNs[i].name), }) }(i) } @@ -413,6 +523,7 @@ func (s *searcher) TopSDNs(limit int, minMatch float64, name string, keepSDN fun func (s *searcher) TopDPs(limit int, minMatch float64, name string) []DP { name = precompute(name) + nameTokens := strings.Fields(name) s.RLock() defer s.RUnlock() @@ -433,7 +544,7 @@ func (s *searcher) TopDPs(limit int, minMatch float64, name string) []DP { xs.add(&item{ matched: s.DPs[i].name, value: s.DPs[i], - weight: jaroWinkler(s.DPs[i].name, name), + weight: bestPairsJaroWinkler(nameTokens, s.DPs[i].name), }) }(i) } @@ -636,11 +747,16 @@ func precomputeDPs(persons []*dpl.DPL, pipe *pipeliner) []*DP { var ( // Jaro-Winkler parameters - boostThreshold = 
readFloat(os.Getenv("JARO_WINKLER_BOOST_THRESHOLD"), 0.7) + boostThreshold = readFloat(os.Getenv("JARO_WINKLER_BOOST_THRESHOLD"), 0.81) prefixSize = readInt(os.Getenv("JARO_WINKLER_PREFIX_SIZE"), 4) + //cCustomised Jaro-Winkler parameters + lengthDifferenceCutoffFactor = readFloat(os.Getenv("LENGTH_DIFFERENCE_CUTOFF_FACTOR"), 0.9) + lengthDifferencePenaltyWeight = readFloat(os.Getenv("LENGTH_DIFFERENCE_PENALTY_WEIGHT"), 0.4) + differentLetterPenaltyWeight = readFloat(os.Getenv("DIFFERENT_LETTER_PENALTY_WEIGHT"), 0.9) // Watchman parameters - exactMatchFavoritism = readFloat(os.Getenv("EXACT_MATCH_FAVORITISM"), 0.0) + exactMatchFavoritism = readFloat(os.Getenv("EXACT_MATCH_FAVORITISM"), 0.0) + unmatchedIndexPenaltyWeight = readFloat(os.Getenv("UNMATCHED_INDEX_TOKEN_WEIGHT"), 0.15) ) func readFloat(override string, value float64) float64 { @@ -679,49 +795,50 @@ var ( adjacentSimilarityPositions = readInt(os.Getenv("ADJACENT_SIMILARITY_POSITIONS"), 3) ) -func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 { - maxMatch := func(word string, s1Idx int, parts []string) (float64, string) { - if word == "" || len(parts) == 0 { +func jaroWinklerWithFavoritism(indexedTerm, query string, favoritism float64) float64 { + maxMatch := func(indexedWord string, indexedWordIdx int, queryWords []string) (float64, string) { + if indexedWord == "" || len(queryWords) == 0 { return 0.0, "" } // We're only looking for the highest match close - start := s1Idx - adjacentSimilarityPositions - end := s1Idx + adjacentSimilarityPositions + start := indexedWordIdx - adjacentSimilarityPositions + end := indexedWordIdx + adjacentSimilarityPositions var max float64 var maxTerm string for i := start; i < end; i++ { - if i >= 0 && len(parts) > i { - score := smetrics.JaroWinkler(word, parts[i], boostThreshold, prefixSize) + if i >= 0 && len(queryWords) > i { + score := smetrics.JaroWinkler(indexedWord, queryWords[i], boostThreshold, prefixSize) if score > max { max = score - 
maxTerm = parts[i] + maxTerm = queryWords[i] } } } return max, maxTerm } - s1Parts, s2Parts := strings.Fields(s1), strings.Fields(s2) - if len(s1Parts) == 0 || len(s2Parts) == 0 { + indexedWords, queryWords := strings.Fields(indexedTerm), strings.Fields(query) + if len(indexedWords) == 0 || len(queryWords) == 0 { return 0.0 // avoid returning NaN later on } var scores []float64 - for i := range s1Parts { - max, term := maxMatch(s1Parts[i], i, s2Parts) + for i := range indexedWords { + max, term := maxMatch(indexedWords[i], i, queryWords) + //fmt.Printf("%s maxMatch %s %f\n", indexedWords[i], term, max) if max >= 1.0 { - // If the query is longer than our indexed term (and both are longer than most names) + // If the query is longer than our indexed term (and EITHER are longer than most names) // we want to reduce the maximum weight proportionally by the term difference, which // forces more terms to match instead of one or two dominating the weight. - if (len(s2Parts) > len(s1Parts)) && (len(s1Parts) > 3 || len(s2Parts) > 3) { - max *= (float64(len(s1Parts)) / float64(len(s2Parts))) + if (len(queryWords) > len(indexedWords)) && (len(indexedWords) > 3 || len(queryWords) > 3) { + max *= (float64(len(indexedWords)) / float64(len(queryWords))) goto add } // If the indexed term is really short cap the match at 90%. // This sill allows names to match highly with a couple different characters. - if len(s1Parts) < 2 && len(s2Parts) > 1 { + if len(indexedWords) == 1 && len(queryWords) > 1 { max *= 0.9 goto add } @@ -734,14 +851,14 @@ func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 { // adjust the max lower by the proportion of different terms. // // We do this to decrease the importance of a short (often common) term. 
- if len(s2Parts) > len(s1Parts) { - scores = append(scores, max*float64(len(s1Parts))/float64(len(s2Parts))) + if len(queryWords) > len(indexedWords) { + scores = append(scores, max*float64(len(indexedWords))/float64(len(queryWords))) continue } // Apply an additional weight based on similarity of term lengths, // so terms which are closer in length match higher. - s1 := float64(len(s1Parts[i])) + s1 := float64(len(indexedWords[i])) t := float64(len(term)) - 1 weight := math.Min(math.Abs(s1/t), 1.0) @@ -749,11 +866,11 @@ func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 { } } - // average the highest N scores where N is the words in our query (s2). + // average the highest N scores where N is the words in our query (query). // Only truncate scores if there are enough words (aka more than First/Last). sort.Float64s(scores) - if len(s1Parts) > len(s2Parts) && len(s2Parts) > 5 { - scores = scores[len(s1Parts)-len(s2Parts):] + if len(indexedWords) > len(queryWords) && len(queryWords) > 5 { + scores = scores[len(indexedWords)-len(queryWords):] } var sum float64 diff --git a/cmd/server/search_generic.go b/cmd/server/search_generic.go index 8a1d2e5fd..df977fb1d 100644 --- a/cmd/server/search_generic.go +++ b/cmd/server/search_generic.go @@ -7,6 +7,7 @@ package main import ( "encoding/json" "reflect" + "strings" "sync" ) @@ -52,6 +53,8 @@ func topResults[T any](limit int, minMatch float64, name string, data []*Result[ } name = precompute(name) + nameTokens := strings.Fields(name) + xs := newLargest(limit, minMatch) var wg sync.WaitGroup @@ -64,7 +67,7 @@ func topResults[T any](limit int, minMatch float64, name string, data []*Result[ it := &item{ matched: data[i].precomputedName, value: data[i], - weight: jaroWinkler(data[i].precomputedName, name), + weight: bestPairsJaroWinkler(nameTokens, data[i].precomputedName), } for _, alt := range data[i].precomputedAlts { @@ -72,7 +75,7 @@ func topResults[T any](limit int, minMatch float64, name string, 
data []*Result[ continue } - score := jaroWinkler(alt, name) + score := bestPairsJaroWinkler(nameTokens, alt) if score > it.weight { it.matched = alt it.weight = score From df9ce902ee9adfce3195cabc99a18de877b166c6 Mon Sep 17 00:00:00 2001 From: Tom Daffurn Date: Fri, 8 Dec 2023 14:08:37 +1000 Subject: [PATCH 2/7] Add tests and fix tests broken by changing scores --- cmd/server/issue115_test.go | 6 +- cmd/server/new_algorithm_test.go | 100 +++++++++++++++++++++++++++++ cmd/server/search.go | 6 +- cmd/server/search_eu_csl_test.go | 2 +- cmd/server/search_handlers_test.go | 4 +- cmd/server/search_test.go | 8 +-- cmd/server/search_us_csl_test.go | 8 +-- 7 files changed, 117 insertions(+), 17 deletions(-) create mode 100644 cmd/server/new_algorithm_test.go diff --git a/cmd/server/issue115_test.go b/cmd/server/issue115_test.go index f21b944d1..fb53e158d 100644 --- a/cmd/server/issue115_test.go +++ b/cmd/server/issue115_test.go @@ -29,13 +29,13 @@ func TestIssue115__TopSDNs(t *testing.T) { s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "2680", SDNName: "HABBASH, George", SDNType: "INDIVIDUAL"}}, nil, pipe) out := s.TopSDNs(1, 0.00, "george bush", keeper) - eql(t, "issue115: top SDN 2680", out[0].match, 0.732) + eql(t, "issue115: top SDN 2680", out[0].match, 0.687) // was 88.3% match s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "9432", SDNName: "CHIWESHE, George", SDNType: "INDIVIDUAL"}}, nil, pipe) out = s.TopSDNs(1, 0.00, "george bush", keeper) - eql(t, "issue115: top SDN 18996", out[0].match, 0.764) + eql(t, "issue115: top SDN 18996", out[0].match, 0.650) // another example s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "0", SDNName: "Bush, George W", SDNType: "INDIVIDUAL"}}, nil, pipe) @@ -47,5 +47,5 @@ func TestIssue115__TopSDNs(t *testing.T) { eql(t, "issue115: top SDN 0", out[0].match, 1.0) out = s.TopSDNs(1, 0.00, "george bush", keeper) - eql(t, "issue115: top SDN 0", out[0].match, 0.667) + eql(t, "issue115: top SDN 0", out[0].match, 0.986) } diff --git 
a/cmd/server/new_algorithm_test.go b/cmd/server/new_algorithm_test.go new file mode 100644 index 000000000..b3945ade1 --- /dev/null +++ b/cmd/server/new_algorithm_test.go @@ -0,0 +1,100 @@ +// Copyright 2022 The Moov Authors +// Use of this source code is governed by an Apache License +// license that can be found in the LICENSE file. + +package main + +import ( + "strings" + "testing" +) + +func TestBestPairsJaroWinkler__FalsePositives(t *testing.T) { + // Words in the query should be matched against at most one indexed word. Doubled names on the sanctioned list can + // skew results + // 1. SDN Entity 40273, VLADIMIROV, Vladimir Vladimirovich + query := "vladimir levenshtein" + indexedName := "vladimirov vladimir vladimirovich" + score1 := jaroWinkler(indexedName, query) + score2 := bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too high: "+query, score1, 0.961) + eql(t, "New score is better: "+query, score2, 0.603) + + // 2. SDN Entity 7788 "SHAQIRI, Shaqir" + query = "zaid shakir" + indexedName = "shaqiri shaqir" + score1 = jaroWinkler(indexedName, query) + score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too high: "+query, score1, 0.908) + eql(t, "New score is better: "+query, score2, 0.704) + + // Single-word sanctioned names shouldn't match any query with that name part + // 1. SDN Entity 15050 "HADI" + query = "hadi alwai" + indexedName = "hadi" + score1 = jaroWinkler(indexedName, query) + score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too high: "+query, score1, 0.900) + eql(t, "New score is better: "+query, score2, 0.615) + + // Name-part scores should be weighted by the character length. If not, small words can have unfair weight + // 1. 
SDN Entity "LI, Shangfu" + query = "li shanlan" + indexedName = "li shangfu" + score1 = jaroWinkler(indexedName, query) + score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too high: "+query, score1, 0.914) + eql(t, "New score is better: "+query, score2, 0.867) + + // Words with different lengths shouldn't match very highly + query = "brown" + indexedName = "browningweight" + score1 = jaroWinkler(indexedName, query) + score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too high: "+query, score1, 0.871) + eql(t, "New score is better: "+query, score2, 0.703) + + // Words that start with different letters shouldn't match very highly + query = "jimenez" + indexedName = "dominguez" + score1 = jaroWinkler(indexedName, query) + score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too high: "+query, score1, 0.690) + eql(t, "New score is better: "+query, score2, 0.580) +} + +func TestBestPairsJaroWinkler__TruePositives(t *testing.T) { + // Unmatched indexed words had a large weight, causing false negatives for missing "middle names" + // 1. Saddam Hussein + query := "saddam hussien" + indexedName := "saddam hussein al tikriti" + score1 := jaroWinkler(indexedName, query) + score2 := bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too low: "+query, score1, 0.656) + eql(t, "New score is better: "+query, score2, 0.924) + + // 2. SDN Entity 7574 "VALENCIA TRUJILLO, Joaquin Mario" + query = "valencia trujillo joaquin" + indexedName = "valencia trujillo joaquin mario" + score1 = jaroWinkler(indexedName, query) + score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too low: "+query, score1, 0.868) + eql(t, "New score is better: "+query, score2, 0.973) + + // 3. 
SDN Entity 9760 "LUKASHENKO, Alexander Grigoryevich" + query = "alexander lukashenko" + indexedName = "lukashenko alexander grigoryevich" + score1 = jaroWinkler(indexedName, query) + score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too low: "+query, score1, 0.765) + eql(t, "New score is better: "+query, score2, 0.942) + + // Small words had too much weight, causing false negatives + // 1. SDN Entity 4691 "A.I.C. SOGO KENKYUSHO" + query = "sogo kenkyusho" + indexedName = "a i c sogo kenkyusho" + score1 = jaroWinkler(indexedName, query) + score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) + eql(t, "Score is too low: "+query, score1, 0.400) + eql(t, "New score is better: "+query, score2, 0.972) +} diff --git a/cmd/server/search.go b/cmd/server/search.go index e65a826e9..17002d4b6 100644 --- a/cmd/server/search.go +++ b/cmd/server/search.go @@ -747,11 +747,11 @@ func precomputeDPs(persons []*dpl.DPL, pipe *pipeliner) []*DP { var ( // Jaro-Winkler parameters - boostThreshold = readFloat(os.Getenv("JARO_WINKLER_BOOST_THRESHOLD"), 0.81) + boostThreshold = readFloat(os.Getenv("JARO_WINKLER_BOOST_THRESHOLD"), 0.7) prefixSize = readInt(os.Getenv("JARO_WINKLER_PREFIX_SIZE"), 4) - //cCustomised Jaro-Winkler parameters + // Customised Jaro-Winkler parameters lengthDifferenceCutoffFactor = readFloat(os.Getenv("LENGTH_DIFFERENCE_CUTOFF_FACTOR"), 0.9) - lengthDifferencePenaltyWeight = readFloat(os.Getenv("LENGTH_DIFFERENCE_PENALTY_WEIGHT"), 0.4) + lengthDifferencePenaltyWeight = readFloat(os.Getenv("LENGTH_DIFFERENCE_PENALTY_WEIGHT"), 0.3) differentLetterPenaltyWeight = readFloat(os.Getenv("DIFFERENT_LETTER_PENALTY_WEIGHT"), 0.9) // Watchman parameters diff --git a/cmd/server/search_eu_csl_test.go b/cmd/server/search_eu_csl_test.go index 22422356e..7d192c218 100644 --- a/cmd/server/search_eu_csl_test.go +++ b/cmd/server/search_eu_csl_test.go @@ -28,7 +28,7 @@ func TestSearch__EU_CSL(t *testing.T) { w.Flush() require.Equal(t, 
http.StatusOK, w.Code) - require.Contains(t, w.Body.String(), `"match":0.65555`) + require.Contains(t, w.Body.String(), `"match":0.92419`) require.Contains(t, w.Body.String(), `"matchedName":"saddam hussein al tikriti"`) var wrapper struct { diff --git a/cmd/server/search_handlers_test.go b/cmd/server/search_handlers_test.go index 05973fa70..db9ee0af5 100644 --- a/cmd/server/search_handlers_test.go +++ b/cmd/server/search_handlers_test.go @@ -269,7 +269,7 @@ func TestSearch__Name(t *testing.T) { w.Flush() require.Equal(t, http.StatusOK, w.Code) - require.Contains(t, w.Body.String(), `"match":0.89166`) + require.Contains(t, w.Body.String(), `"match":0.95588`) require.Contains(t, w.Body.String(), `"matchedName":"dr ayman al zawahiri"`) var wrapper struct { @@ -319,7 +319,7 @@ func TestSearch__AltName(t *testing.T) { } require.Equal(t, http.StatusOK, w.Code) - require.Contains(t, w.Body.String(), `"match":0.5`) + require.Contains(t, w.Body.String(), `"match":0.98`) require.Contains(t, w.Body.String(), `"matchedName":"i c sogo kenkyusho"`) var wrapper struct { diff --git a/cmd/server/search_test.go b/cmd/server/search_test.go index 913ce46a4..90ea38ad6 100644 --- a/cmd/server/search_test.go +++ b/cmd/server/search_test.go @@ -581,9 +581,9 @@ func TestSearch_liveData(t *testing.T) { name string match float64 // top match % }{ - {"Nicolas MADURO", 0.910}, - {"nicolas maduro", 0.910}, - {"NICOLAS maduro", 0.910}, + {"Nicolas MADURO", 0.958}, + {"nicolas maduro", 0.958}, + {"NICOLAS maduro", 0.958}, } keeper := keepSDN(filterRequest{}) @@ -753,7 +753,7 @@ func TestSearch__TopSDNs(t *testing.T) { if len(sdns) == 0 { t.Fatal("empty SDNs") } - require.Equal(t, "2681", sdns[0].EntityID) + require.Equal(t, "2676", sdns[0].EntityID) } func TestSearch__TopDPs(t *testing.T) { diff --git a/cmd/server/search_us_csl_test.go b/cmd/server/search_us_csl_test.go index 3f04f600a..f05a878a8 100644 --- a/cmd/server/search_us_csl_test.go +++ b/cmd/server/search_us_csl_test.go @@ -30,8 +30,8 
@@ func TestSearch_US_CSL(t *testing.T) { w.Flush() require.Equal(t, http.StatusOK, w.Code) - require.Contains(t, w.Body.String(), `"match":0.6333`) - require.Contains(t, w.Body.String(), `"matchedName":"zaman"`) + require.Contains(t, w.Body.String(), `"match":0.89`) + require.Contains(t, w.Body.String(), `"matchedName":"abdul qadeer khan"`) var wrapper struct { NonProliferationSanctions []csl.ISN `json:"nonProliferationSanctions"` @@ -76,7 +76,7 @@ func TestSearcher_TopMEUs(t *testing.T) { require.Len(t, meus, 1) require.Equal(t, "d54346ef81802673c1b1daeb2ca8bd5d13755abd", meus[0].Data.EntityID) - require.Equal(t, "0.70597", fmt.Sprintf("%.5f", meus[0].match)) + require.Equal(t, "0.88750", fmt.Sprintf("%.5f", meus[0].match)) } func TestSearcher_TopSSIs(t *testing.T) { @@ -120,7 +120,7 @@ func TestSearcher_TopISNs(t *testing.T) { isn := isns[0] require.Equal(t, "2d2db09c686e4829d0ef1b0b04145eec3d42cd88", isn.Data.EntityID) - require.Equal(t, "0.92", fmt.Sprintf("%.2f", isn.match)) + require.Equal(t, "0.93", fmt.Sprintf("%.2f", isn.match)) } func TestSearcher_TopUVLs(t *testing.T) { From 8ddbe1629c817a4e01bf3c5a004a82c971158802 Mon Sep 17 00:00:00 2001 From: Tom Daffurn Date: Fri, 8 Dec 2023 15:28:39 +1000 Subject: [PATCH 3/7] Update `TestJaroWinkler` to use best-pairs algo --- cmd/server/search_test.go | 124 +++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/cmd/server/search_test.go b/cmd/server/search_test.go index 90ea38ad6..60eaa70ac 100644 --- a/cmd/server/search_test.go +++ b/cmd/server/search_test.go @@ -429,110 +429,110 @@ func verifyDownloadStats(b *testing.B) { func TestJaroWinkler(t *testing.T) { cases := []struct { - s1, s2 string - match float64 + indexed, search string + match float64 }{ // examples - {"wei, zhao", "wei, Zhao", 0.917}, + {"wei, zhao", "wei, Zhao", 0.875}, {"WEI, Zhao", "WEI, Zhao", 1.0}, {"WEI Zhao", "WEI Zhao", 1.0}, {strings.ToLower("WEI Zhao"), precompute("WEI, Zhao"), 1.0}, // 
apply jaroWinkler in both directions - {"jane doe", "jan lahore", 0.621}, - {"jan lahore", "jane doe", 0.776}, + {"jane doe", "jan lahore", 0.596}, + {"jan lahore", "jane doe", 0.596}, // real world case - {"john doe", "paul john", 0.764}, - {"john doe", "john othername", 0.618}, + {"john doe", "paul john", 0.533}, + {"john doe", "john othername", 0.672}, // close match - {"jane doe", "jane doe2", 0.971}, + {"jane doe", "jane doe2", 0.940}, // real-ish world examples - {"kalamity linden", "kala limited", 0.771}, - {"kala limited", "kalamity linden", 0.602}, + {"kalamity linden", "kala limited", 0.687}, + {"kala limited", "kalamity linden", 0.687}, // examples used in demos / commonly {"nicolas", "nicolas", 1.0}, - {"nicolas moros maduro", "nicolas maduro", 0.91}, - {"nicolas maduro", "nicolas moros maduro", 1.0}, + {"nicolas moros maduro", "nicolas maduro", 0.958}, + {"nicolas maduro", "nicolas moros maduro", 0.839}, // customer examples - {"ian", "ian mckinley", 0.9}, - {"iap", "ian mckinley", 0.411}, - {"ian mckinley", "ian", 0.819}, - {"ian mckinley", "iap", 0.654}, - {"ian mckinley", "tian xiang 7", 0.5}, - {"bindaree food group pty", precompute("independent insurance group ltd"), 0.659}, // precompute removes ltd - {"bindaree food group pty ltd", "independent insurance group ltd", 0.728}, // only matches higher from 'ltd' - {"p.c.c. 
(singapore) private limited", "culver max entertainment private limited", 0.602}, - {"zincum llc", "easy verification inc.", 0.426}, - {"transpetrochart co ltd", "jx metals trading co.", 0.544}, - {"technolab", "moomoo technologies inc", 0.291}, - {"sewa security services", "sesa - safety & environmental services australia pty ltd", 0.247}, - {"bueno", "20/f rykadan capital twr135 hoi bun rd, kwun tong 135 hoi bun rd., kwun tong", 0.0}, + {"ian", "ian mckinley", 0.429}, + {"iap", "ian mckinley", 0.352}, + {"ian mckinley", "ian", 0.891}, + {"ian mckinley", "iap", 0.733}, + {"ian mckinley", "tian xiang 7", 0.526}, + {"bindaree food group pty", precompute("independent insurance group ltd"), 0.576}, // precompute removes ltd + {"bindaree food group pty ltd", "independent insurance group ltd", 0.631}, // only matches higher from 'ltd' + {"p.c.c. (singapore) private limited", "culver max entertainment private limited", 0.658}, + {"zincum llc", "easy verification inc.", 0.380}, + {"transpetrochart co ltd", "jx metals trading co.", 0.496}, + {"technolab", "moomoo technologies inc", 0.565}, + {"sewa security services", "sesa - safety & environmental services australia pty ltd", 0.480}, + {"bueno", "20/f rykadan capital twr135 hoi bun rd, kwun tong 135 hoi bun rd., kwun tong", 0.094}, // example cases - {"nicolas maduro", "nicolás maduro", 0.961}, + {"nicolas maduro", "nicolás maduro", 0.937}, {"nicolas maduro", precompute("nicolás maduro"), 1.0}, - {"nic maduro", "nicolas maduro", 0.717}, - {"nick maduro", "nicolas maduro", 0.769}, - {"nicolas maduroo", "nicolas maduro", 0.986}, + {"nic maduro", "nicolas maduro", 0.872}, + {"nick maduro", "nicolas maduro", 0.859}, + {"nicolas maduroo", "nicolas maduro", 0.966}, {"nicolas maduro", "nicolas maduro", 1.0}, {"maduro, nicolas", "maduro, nicolas", 1.0}, {"maduro moros, nicolas", "maduro moros, nicolas", 1.0}, - {"maduro moros, nicolas", "nicolas maduro", 0.889}, - {"nicolas maduro moros", "maduro", 0.722}, - {"nicolas maduro 
moros", "nicolás maduro", 0.884}, - {"nicolas, maduro moros", "maduro", 0.720}, - {"nicolas, maduro moros", "nicolas maduro", 0.902}, - {"nicolas, maduro moros", "nicolás", 0.554}, - {"nicolas, maduro moros", "maduro", 0.720}, - {"nicolas, maduro moros", "nicolás maduro", 0.877}, - {"africada financial services bureau change", "skylight", 0.266}, - {"africada financial services bureau change", "skylight financial inc", 0.596}, - {"africada financial services bureau change", "skylight services inc", 0.645}, - {"africada financial services bureau change", "skylight financial services", 0.67}, - {"africada financial services bureau change", "skylight financial services inc", 0.696}, + {"maduro moros, nicolas", "nicolas maduro", 0.953}, + {"nicolas maduro moros", "maduro", 0.900}, + {"nicolas maduro moros", "nicolás maduro", 0.898}, + {"nicolas, maduro moros", "maduro", 0.897}, + {"nicolas, maduro moros", "nicolas maduro", 0.928}, + {"nicolas, maduro moros", "nicolás", 0.822}, + {"nicolas, maduro moros", "maduro", 0.897}, + {"nicolas, maduro moros", "nicolás maduro", 0.906}, + {"africada financial services bureau change", "skylight", 0.441}, + {"africada financial services bureau change", "skylight financial inc", 0.658}, + {"africada financial services bureau change", "skylight services inc", 0.621}, + {"africada financial services bureau change", "skylight financial services", 0.761}, + {"africada financial services bureau change", "skylight financial services inc", 0.730}, // stopwords tests - {"the group for the preservation of the holy sites", "the bridgespan group", 0.448}, - {precompute("the group for the preservation of the holy sites"), precompute("the bridgespan group"), 0.448}, - {"group preservation holy sites", "bridgespan group", 0.619}, + {"the group for the preservation of the holy sites", "the bridgespan group", 0.682}, + {precompute("the group for the preservation of the holy sites"), precompute("the bridgespan group"), 0.682}, + {"group preservation 
holy sites", "bridgespan group", 0.652}, - {"the group for the preservation of the holy sites", "the logan group", 0.424}, - {precompute("the group for the preservation of the holy sites"), precompute("the logan group"), 0.424}, - {"group preservation holy sites", "logan group", 0.478}, + {"the group for the preservation of the holy sites", "the logan group", 0.730}, + {precompute("the group for the preservation of the holy sites"), precompute("the logan group"), 0.730}, + {"group preservation holy sites", "logan group", 0.649}, - {"the group for the preservation of the holy sites", "the anything group", 0.437}, - {precompute("the group for the preservation of the holy sites"), precompute("the anything group"), 0.437}, + {"the group for the preservation of the holy sites", "the anything group", 0.698}, + {precompute("the group for the preservation of the holy sites"), precompute("the anything group"), 0.698}, {"group preservation holy sites", "anything group", 0.585}, - {"the group for the preservation of the holy sites", "the hello world group", 0.47}, - {precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 0.47}, - {"group preservation holy sites", "hello world group", 0.515}, + {"the group for the preservation of the holy sites", "the hello world group", 0.706}, + {precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 0.706}, + {"group preservation holy sites", "hello world group", 0.560}, - {"the group for the preservation of the holy sites", "the group", 0.416}, - {precompute("the group for the preservation of the holy sites"), precompute("the group"), 0.416}, - {"group preservation holy sites", "group", 0.460}, + {"the group for the preservation of the holy sites", "the group", 0.880}, + {precompute("the group for the preservation of the holy sites"), precompute("the group"), 0.880}, + {"group preservation holy sites", "group", 0.879}, - {"the group for the 
preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.403}, + {"the group for the preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.426}, { precompute("the group for the preservation of the holy sites"), precompute("the flibbity jibbity flobbity jobbity grobbity zobbity group"), - 0.459, + 0.446, }, - {"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.239}, + {"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.334}, // precompute - {"i c sogo kenkyusho", precompute("A.I.C. SOGO KENKYUSHO"), 0.5}, - {precompute("A.I.C. SOGO KENKYUSHO"), "sogo kenkyusho", 0.667}, + {"i c sogo kenkyusho", precompute("A.I.C. SOGO KENKYUSHO"), 0.858}, + {precompute("A.I.C. SOGO KENKYUSHO"), "sogo kenkyusho", 0.972}, } for i := range cases { v := cases[i] // Only need to call chomp on s1, see jaroWinkler doc - eql(t, fmt.Sprintf("#%d %s vs %s", i, v.s1, v.s2), jaroWinkler(v.s1, v.s2), v.match) + eql(t, fmt.Sprintf("#%d %s vs %s", i, v.indexed, v.search), bestPairsJaroWinkler(strings.Fields(v.search), v.indexed), v.match) } } From 19bae9299933f3c611e22121008c6b354a95b5d9 Mon Sep 17 00:00:00 2001 From: Tom Daffurn Date: Mon, 11 Dec 2023 10:44:19 +1000 Subject: [PATCH 4/7] Clean up test --- cmd/server/new_algorithm_test.go | 96 ++++++++++++-------------------- 1 file changed, 36 insertions(+), 60 deletions(-) diff --git a/cmd/server/new_algorithm_test.go b/cmd/server/new_algorithm_test.go index b3945ade1..300672285 100644 --- a/cmd/server/new_algorithm_test.go +++ b/cmd/server/new_algorithm_test.go @@ -13,88 +13,64 @@ func TestBestPairsJaroWinkler__FalsePositives(t *testing.T) { // Words in the query should be matched against at most one indexed word. Doubled names on the sanctioned list can // skew results // 1. 
SDN Entity 40273, VLADIMIROV, Vladimir Vladimirovich - query := "vladimir levenshtein" - indexedName := "vladimirov vladimir vladimirovich" - score1 := jaroWinkler(indexedName, query) - score2 := bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too high: "+query, score1, 0.961) - eql(t, "New score is better: "+query, score2, 0.603) + oldScore, newScore := compareAlgorithms("vladimirov vladimir vladimirovich", "vladimir levenshtein") + eql(t, "Score is too high", oldScore, 0.961) + eql(t, "New score is better", newScore, 0.603) // 2. SDN Entity 7788 "SHAQIRI, Shaqir" - query = "zaid shakir" - indexedName = "shaqiri shaqir" - score1 = jaroWinkler(indexedName, query) - score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too high: "+query, score1, 0.908) - eql(t, "New score is better: "+query, score2, 0.704) + oldScore, newScore = compareAlgorithms("shaqiri shaqir", "zaid shakir") + eql(t, "Score is too high", oldScore, 0.908) + eql(t, "New score is better", newScore, 0.704) // Single-word sanctioned names shouldn't match any query with that name part // 1. SDN Entity 15050 "HADI" - query = "hadi alwai" - indexedName = "hadi" - score1 = jaroWinkler(indexedName, query) - score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too high: "+query, score1, 0.900) - eql(t, "New score is better: "+query, score2, 0.615) + oldScore, newScore = compareAlgorithms("hadi", "hadi alwai") + eql(t, "Score is too high", oldScore, 0.900) + eql(t, "New score is better", newScore, 0.615) // Name-part scores should be weighted by the character length. If not, small words can have unfair weight // 1. 
SDN Entity "LI, Shangfu" - query = "li shanlan" - indexedName = "li shangfu" - score1 = jaroWinkler(indexedName, query) - score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too high: "+query, score1, 0.914) - eql(t, "New score is better: "+query, score2, 0.867) + oldScore, newScore = compareAlgorithms("li shangfu", "li shanlan") + eql(t, "Score is too high", oldScore, 0.914) + eql(t, "New score is better", newScore, 0.867) // Words with different lengths shouldn't match very highly - query = "brown" - indexedName = "browningweight" - score1 = jaroWinkler(indexedName, query) - score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too high: "+query, score1, 0.871) - eql(t, "New score is better: "+query, score2, 0.703) + oldScore, newScore = compareAlgorithms("browningweight", "brown") + eql(t, "Score is too high", oldScore, 0.871) + eql(t, "New score is better", newScore, 0.703) // Words that start with different letters shouldn't match very highly - query = "jimenez" - indexedName = "dominguez" - score1 = jaroWinkler(indexedName, query) - score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too high: "+query, score1, 0.690) - eql(t, "New score is better: "+query, score2, 0.580) + oldScore, newScore = compareAlgorithms("dominguez", "jimenez") + eql(t, "Score is too high", oldScore, 0.690) + eql(t, "New score is better", newScore, 0.580) } func TestBestPairsJaroWinkler__TruePositives(t *testing.T) { // Unmatched indexed words had a large weight, causing false negatives for missing "middle names" // 1. 
Saddam Hussein - query := "saddam hussien" - indexedName := "saddam hussein al tikriti" - score1 := jaroWinkler(indexedName, query) - score2 := bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too low: "+query, score1, 0.656) - eql(t, "New score is better: "+query, score2, 0.924) + oldScore, newScore := compareAlgorithms("saddam hussein al tikriti", "saddam hussien") + eql(t, "Score is too low", oldScore, 0.656) + eql(t, "New score is better", newScore, 0.924) // 2. SDN Entity 7574 "VALENCIA TRUJILLO, Joaquin Mario" - query = "valencia trujillo joaquin" - indexedName = "valencia trujillo joaquin mario" - score1 = jaroWinkler(indexedName, query) - score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too low: "+query, score1, 0.868) - eql(t, "New score is better: "+query, score2, 0.973) + oldScore, newScore = compareAlgorithms("valencia trujillo joaquin mario", "valencia trujillo joaquin") + eql(t, "Score is too low", oldScore, 0.868) + eql(t, "New score is better", newScore, 0.973) // 3. SDN Entity 9760 "LUKASHENKO, Alexander Grigoryevich" - query = "alexander lukashenko" - indexedName = "lukashenko alexander grigoryevich" - score1 = jaroWinkler(indexedName, query) - score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too low: "+query, score1, 0.765) - eql(t, "New score is better: "+query, score2, 0.942) + oldScore, newScore = compareAlgorithms("lukashenko alexander grigoryevich", "alexander lukashenko") + eql(t, "Score is too low", oldScore, 0.765) + eql(t, "New score is better", newScore, 0.942) // Small words had too much weight, causing false negatives // 1. SDN Entity 4691 "A.I.C. 
SOGO KENKYUSHO" - query = "sogo kenkyusho" - indexedName = "a i c sogo kenkyusho" - score1 = jaroWinkler(indexedName, query) - score2 = bestPairsJaroWinkler(strings.Fields(query), indexedName) - eql(t, "Score is too low: "+query, score1, 0.400) - eql(t, "New score is better: "+query, score2, 0.972) + oldScore, newScore = compareAlgorithms("a i c sogo kenkyusho", "sogo kenkyusho") + eql(t, "Score is too low", oldScore, 0.400) + eql(t, "New score is better", newScore, 0.972) +} + +func compareAlgorithms(indexedName string, query string) (float64, float64) { + oldScore := jaroWinkler(indexedName, query) + newScore := bestPairsJaroWinkler(strings.Fields(query), indexedName) + return oldScore, newScore } From 336b96c9d4a6a74abfc2a7be3820bf87a475c8f6 Mon Sep 17 00:00:00 2001 From: Tom Daffurn Date: Wed, 13 Dec 2023 09:09:19 +1000 Subject: [PATCH 5/7] Fix linter issue --- cmd/server/search.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/server/search.go b/cmd/server/search.go index 17002d4b6..1260d9f36 100644 --- a/cmd/server/search.go +++ b/cmd/server/search.go @@ -330,7 +330,7 @@ func bestPairsJaroWinkler(searchTokens []string, indexed string) float64 { totalWeightedScores := 0.0 for _, score := range scores { //If neither the search token nor index token have been matched so far - if matchedSearchTokens[score.searchTokenIdx] == false && matchedIndexTokens[score.indexTokenIdx] == false { + if !matchedSearchTokens[score.searchTokenIdx] && !matchedIndexTokens[score.indexTokenIdx] { //Weight the importance of this word score by its character length searchToken := searchTokens[score.searchTokenIdx] indexToken := indexedTokens[score.indexTokenIdx] From b99f1e3523cc00e9330a78b2d1d4ea48afdfd92d Mon Sep 17 00:00:00 2001 From: Tom Daffurn Date: Wed, 13 Dec 2023 09:09:36 +1000 Subject: [PATCH 6/7] Update config settings --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2c7dd7968..80033f06a 
100644 --- a/README.md +++ b/README.md @@ -184,7 +184,7 @@ You should get this response: PONG ``` -### Configuration settings +### Configuration settings | Environmental Variable | Description | Default | |-----|-----|-----| @@ -192,6 +192,10 @@ PONG | `INITIAL_DATA_DIRECTORY` | Directory filepath with initial files to use instead of downloading. Periodic downloads will replace the initial files. | Empty | | `ADJACENT_SIMILARITY_POSITIONS` | How many nearby words to search for highest max similarly score. | 3 | | `EXACT_MATCH_FAVORITISM` | Extra weighting assigned to exact matches. | 0.0 | +| `LENGTH_DIFFERENCE_CUTOFF_FACTOR` | Minimum ratio for the length of two matching tokens, before their score is penalised. | 0.9 | +| `LENGTH_DIFFERENCE_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens have different lengths. | 0.3 | +| `DIFFERENT_LETTER_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens begin with different letters. | 0.9 | +| `UNMATCHED_INDEX_TOKEN_WEIGHT` | Weight of penalty applied to scores when part of the indexed name isn't matched. | 0.15 | | `JARO_WINKLER_BOOST_THRESHOLD` | Jaro-Winkler boost threshold. | 0.7 | | `JARO_WINKLER_PREFIX_SIZE` | Jaro-Winkler prefix size. | 4 | | `WEBHOOK_BATCH_SIZE` | How many watches to read from database per batch of async searches. | 100 | From ddb46e7f2f4d950c329730bec4c76d9e39329755 Mon Sep 17 00:00:00 2001 From: Tom Daffurn Date: Wed, 13 Dec 2023 09:23:44 +1000 Subject: [PATCH 7/7] More linter fixes --- cmd/server/search.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/server/search.go b/cmd/server/search.go index 1260d9f36..aa73c12c4 100644 --- a/cmd/server/search.go +++ b/cmd/server/search.go @@ -349,7 +349,7 @@ func bestPairsJaroWinkler(searchTokens []string, indexed string) float64 { //unmatched portion, then scale down the final score. 
matchedIndexLength := 0 for i, str := range indexedTokens { - if matchedIndexTokens[i] == true { + if matchedIndexTokens[i] { matchedIndexLength += len(str) } }