Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce Jaro term proximity #511

Merged
merged 2 commits into from
Nov 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ PONG
|-----|-----|-----|
| `DATA_REFRESH_INTERVAL` | Interval for data redownload and reparse. `off` disables this refreshing. | 12h |
| `INITIAL_DATA_DIRECTORY` | Directory filepath with initial files to use instead of downloading. Periodic downloads will replace the initial files. | Empty |
| `ADJACENT_SIMILARITY_POSITIONS` | How many nearby word positions to search for the highest similarity score. | 3 |
| `EXACT_MATCH_FAVORITISM` | Extra weighting assigned to exact matches. | 0.0 |
| `JARO_WINKLER_BOOST_THRESHOLD` | Jaro-Winkler boost threshold. | 0.7 |
| `JARO_WINKLER_PREFIX_SIZE` | Jaro-Winkler prefix size. | 4 |
Expand Down
10 changes: 5 additions & 5 deletions cmd/server/issue326_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,17 @@ func TestIssue326(t *testing.T) {

// Cuba
score := jaroWinkler(precompute("Huawei Cuba"), precompute("Huawei"))
assert.Equal(t, score, 0.8055555555555556)
assert.Equal(t, 0.8055555555555556, score)

// India
score = jaroWinkler(india, precompute("Huawei"))
assert.Equal(t, score, 0.5592063492063492)
assert.Equal(t, 0.4846031746031746, score)
score = jaroWinkler(india, precompute("Huawei Technologies"))
assert.Equal(t, score, 0.6903174603174603)
assert.Equal(t, 0.6903174603174603, score)

// Investment
score = jaroWinkler(investment, precompute("Huawei"))
assert.Equal(t, score, 0.3788888888888889)
assert.Equal(t, 0.3788888888888889, score)
score = jaroWinkler(investment, precompute("Huawei Technologies"))
assert.Equal(t, score, 0.7377777777777779)
assert.Equal(t, 0.7377777777777779, score)
}
26 changes: 19 additions & 7 deletions cmd/server/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -648,21 +648,34 @@ func readInt(override string, value int) int {
// jaroWinkler runs the similarly named algorithm over the two input strings and averages their match percentages
// according to the second string (assumed to be the user's query)
//
// Each term of s1 is compared only against the terms of s2 that sit at nearby positions, and the highest
// score found among those near neighbors is the one accumulated for that term.
//
// This is a thin wrapper: it calls jaroWinklerWithFavoritism with exactMatchFavoritism as the favoritism value.
//
// For more details see https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
func jaroWinkler(s1, s2 string) float64 {
return jaroWinklerWithFavoritism(s1, s2, exactMatchFavoritism)
}

var (
// adjacentSimilarityPositions sets the size of the window of term positions, centered on a term's own
// index, that is searched in the other string for that term's best match. It is initialized from the
// ADJACENT_SIMILARITY_POSITIONS environment variable through readInt, with 3 passed as the value argument
// (documented as the default).
adjacentSimilarityPositions = readInt(os.Getenv("ADJACENT_SIMILARITY_POSITIONS"), 3)
)

func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 {
maxMatch := func(word string, parts []string) float64 {
maxMatch := func(word string, s1Idx int, parts []string) float64 {
if len(parts) == 0 {
return 0.0
}

max := smetrics.JaroWinkler(word, parts[0], boostThreshold, prefixSize)
for i := 1; i < len(parts); i++ {
if score := smetrics.JaroWinkler(word, parts[i], boostThreshold, prefixSize); score > max {
max = score
// We're only looking for the highest match close
start := s1Idx - adjacentSimilarityPositions
end := s1Idx + adjacentSimilarityPositions

var max float64
for i := start; i < end; i++ {
if i >= 0 && len(parts) > i {
score := smetrics.JaroWinkler(word, parts[i], boostThreshold, prefixSize)
if score > max {
max = score
}
}
}
return max
Expand All @@ -675,7 +688,7 @@ func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 {

var scores []float64
for i := range s1Parts {
max := maxMatch(s1Parts[i], s2Parts)
max := maxMatch(s1Parts[i], i, s2Parts)
if max >= 1.0 {
max += favoritism
}
Expand All @@ -692,7 +705,6 @@ func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 {
for i := range scores {
sum += scores[i]
}

return math.Min(sum/float64(len(scores)), 1.00)
}

Expand Down
26 changes: 17 additions & 9 deletions cmd/server/search_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,15 @@ func TestJaroWinkler(t *testing.T) {
{"nicolas", "nicolas", 1.0},
{"nicolas moros maduro", "nicolas maduro", 0.91},
{"nicolas maduro", "nicolas moros maduro", 1.0},
// customer examples
{"ian mckinley", "tian xiang 7", 0.750},
{"bindaree food group pty ltd", "independent insurance group ltd", 0.812},
{"p.c.c. (singapore) private limited", "culver max entertainment private limited", 0.753},
{"zincum llc", "easy verification inc.", 0.639},
{"transpetrochart co ltd", "jx metals trading co.", 0.725},
{"technolab", "moomoo technologies inc", 0.87222},
{"sewa security services", "sesa - safety & environmental services australia pty ltd", 0.740},
{"bueno", "20/f rykadan capital twr135 hoi bun rd, kwun tong 135 hoi bun rd., kwun tong", 0.0},
// example cases
{"nicolas maduro", "nicolás maduro", 0.961},
{"nicolas maduro", precompute("nicolás maduro"), 1.0},
Expand All @@ -465,12 +474,11 @@ func TestJaroWinkler(t *testing.T) {
{"nicolas, maduro moros", "nicolás", 0.627},
{"nicolas, maduro moros", "maduro", 0.720},
{"nicolas, maduro moros", "nicolás maduro", 0.877},
{"africada financial services bureau change", "skylight", 0.352},
{"africada financial services bureau change", "skylight", 0.266},
{"africada financial services bureau change", "skylight financial inc", 0.72},
{"africada financial services bureau change", "skylight services inc", 0.806},
{"africada financial services bureau change", "skylight financial services", 0.887},
{"africada financial services bureau change", "skylight financial services inc", 0.79},

// stopwords tests
{"the group for the preservation of the holy sites", "the bridgespan group", 1.00},
{precompute("the group for the preservation of the holy sites"), precompute("the bridgespan group"), 1.00},
Expand All @@ -481,15 +489,15 @@ func TestJaroWinkler(t *testing.T) {
{"the group for the preservation of the holy sites", "the anything group", 1.00},
{precompute("the group for the preservation of the holy sites"), precompute("the anything group"), 1.00},
{"group preservation holy sites", "anything group", 0.617},
{"the group for the preservation of the holy sites", "the hello world group", 1.00},
{precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 1.00},
{"the group for the preservation of the holy sites", "the hello world group", 0.922},
{precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 0.922},
{"group preservation holy sites", "hello world group", 0.687},
{"the group for the preservation of the holy sites", "the group", 0.67},
{precompute("the group for the preservation of the holy sites"), precompute("the group"), 0.67},
{"the group for the preservation of the holy sites", "the group", 0.431},
{precompute("the group for the preservation of the holy sites"), precompute("the group"), 0.431},
{"group preservation holy sites", "group", 0.460},
{"the group for the preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.699},
{precompute("the group for the preservation of the holy sites"), precompute("the flibbity jibbity flobbity jobbity grobbity zobbity group"), .783},
{"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.590},
{"the group for the preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.517},
{precompute("the group for the preservation of the holy sites"), precompute("the flibbity jibbity flobbity jobbity grobbity zobbity group"), 0.572},
{"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.418},

// precompute
{"i c sogo kenkyusho", precompute("A.I.C. SOGO KENKYUSHO"), 0.667},
Expand Down
1 change: 1 addition & 0 deletions docs/usage-configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ menubar: docs-menu
|-----|-----|-----|
| `DATA_REFRESH_INTERVAL` | Interval for data redownload and reparse. `off` disables this refreshing. | 12h |
| `INITIAL_DATA_DIRECTORY` | Directory filepath with initial files to use instead of downloading. Periodic downloads will replace the initial files. | Empty |
| `ADJACENT_SIMILARITY_POSITIONS` | How many nearby word positions to search for the highest similarity score. | 3 |
| `EXACT_MATCH_FAVORITISM` | Extra weighting assigned to exact matches. | 0.0 |
| `JARO_WINKLER_BOOST_THRESHOLD` | Jaro-Winkler boost threshold. | 0.7 |
| `JARO_WINKLER_PREFIX_SIZE` | Jaro-Winkler prefix size. | 4 |
Expand Down
Loading