Skip to content

Commit

Permalink
Added boost for matches directly following the last period (#708)
Browse files Browse the repository at this point in the history
* Added boost for matches directly following the last period

If something matches starting after the last period (of a likely
module) it is boosted. This makes a match for `MyModule.Structs.User`
take precedence over `MyModule.User.Something` while querying for `User`

* Simplified scoring rules

  * Removed case match bonus in favor of camel case bonus
  * Removed pattern length bonus, since it's the same for all subjects
  * Changed incompleteness penalty to be based on the number of
    characters matched in the subject rather than the size of the
    pattern
* dropped min query length to 1
  • Loading branch information
scohen authored Apr 24, 2024
1 parent 3833c18 commit 2f6ea89
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 48 deletions.
150 changes: 110 additions & 40 deletions apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do
2. Patterns that match more consecutive characters
3. Patterns that match the beginning of the subject
4. Patterns that match the case of the subject
5. Patterns that match the tail of a subject starting at the last period
Based loosely on https://medium.com/@Srekel/implementing-a-fuzzy-search-algorithm-for-the-debuginator-cacc349e6c55
"""
Expand All @@ -22,7 +23,7 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do

import Record

defrecord :subject, graphemes: nil, normalized: nil
defrecord :subject, graphemes: nil, normalized: nil, period_positions: [-1]

@typedoc "A match score. Higher numbers mean a more relevant match."
@type score :: integer
Expand All @@ -45,7 +46,13 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do
|> String.graphemes()
|> List.to_tuple()

subject(graphemes: graphemes, normalized: normalize(subject))
normalized = normalize(subject)

subject(
graphemes: graphemes,
normalized: normalized,
period_positions: period_positions(normalized)
)
end

def preprocess(subject) do
Expand Down Expand Up @@ -85,20 +92,25 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do
end
end

defp collect_scores(normalized, normalized_pattern, acc \\ [])
defp collect_scores(normalized, normalized_pattern, starting_index \\ 0, acc \\ [])

defp collect_scores(normalized_subject, normalized_pattern, acc) do
defp collect_scores(normalized_subject, normalized_pattern, starting_index, scores) do
# we collect scores because it's possible that a better match occurs later
# in the subject, and if we start peeling off characters greedily, we'll miss
# it. This is more expensive, but it's still pretty quick.

case do_score(normalized_subject, normalized_pattern, %__MODULE__{}) do
initial_score = %__MODULE__{index: starting_index}

case do_score(normalized_subject, normalized_pattern, initial_score) do
%__MODULE__{match?: true, matched_character_positions: [pos | _]} = score ->
subject_substring = String.slice(normalized_subject, (pos + 1)..-1//1)
collect_scores(subject_substring, normalized_pattern, [score | acc])
slice_start = pos + 1
next_index = starting_index + slice_start
subject_substring = String.slice(normalized_subject, slice_start..-1//1)
scores = [score | scores]
collect_scores(subject_substring, normalized_pattern, next_index, scores)

_ ->
acc
scores
end
end

Expand Down Expand Up @@ -150,36 +162,54 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do
@non_match_score
end

defp calculate_score(%__MODULE__{} = score, subject() = subject, pattern) do
defp calculate_score(%__MODULE__{} = score, subject(graphemes: graphemes) = subject, pattern) do
pattern_length = String.length(pattern)

{consecutive_count, consecutive_bonus} =
consecutive_match_bonus(score.matched_character_positions)

match_amount_boost = consecutive_count * pattern_length * 10

[first_match_position | _] = score.matched_character_positions
consecutive_match_boost(score.matched_character_positions)

pattern_length_boost = pattern_length
match_amount_boost = consecutive_count * pattern_length

# penalize first matches further in the string by making them negative.
first_match_bonus = max(0 - first_match_position, 10)
match_boost = tail_match_boost(score, subject, pattern_length)

case_match_boost = case_match_boost(pattern, score.matched_character_positions, subject)
camel_case_boost = camel_case_boost(score.matched_character_positions, subject)

mismatched_penalty = mismatched_penalty(score.matched_character_positions)

pattern_length_boost + consecutive_bonus + first_match_bonus + case_match_boost +
match_amount_boost - mismatched_penalty
incompleteness_penalty = tuple_size(graphemes) - length(score.matched_character_positions)

consecutive_bonus + match_boost + camel_case_boost +
match_amount_boost - mismatched_penalty - incompleteness_penalty
end

defp normalize(string) do
String.downcase(string)
end

@tail_match_boost 55

defp tail_match_boost(
%__MODULE__{} = score,
subject(graphemes: graphemes, period_positions: period_positions),
pattern_length
) do
[first_match_position | _] = score.matched_character_positions

match_end = first_match_position + pattern_length
subject_length = tuple_size(graphemes)

if MapSet.member?(period_positions, first_match_position - 1) and match_end == subject_length do
# reward a complete match at the end of the last period. This is likely a module
# and the pattern matches the most local parts
@tail_match_boost
else
0
end
end

@consecutive_character_bonus 15

def consecutive_match_bonus(matched_positions) do
def consecutive_match_boost(matched_positions) do
# This function checks for consecutive matched characters, and
# makes matches with more consecutive matched characters worth more.
# This means if I type En, it will match Enum more than it will match
Expand All @@ -205,36 +235,76 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do
def mismatched_penalty(matched_positions) do
{penalty, _} =
matched_positions
|> Enum.reduce({0, -1}, fn matched_position, {penalty, last_match} ->
distance = matched_position - last_match
{penalty + distance * @mismatched_chracter_penalty, matched_position}
|> Enum.reduce({0, -1}, fn
matched_position, {0, _} ->
# only start counting the penalty after the first match,
# otherwise we will inadvertently penalize matches deeper in the string
{0, matched_position}

matched_position, {penalty, last_match} ->
distance = matched_position - last_match

{penalty + distance * @mismatched_chracter_penalty, matched_position}
end)

penalty
end

defp case_match_boost(pattern, matched_positions, subject(graphemes: graphemes)) do
do_case_match_boost(pattern, matched_positions, graphemes, 0)
@camel_case_boost 5
defp camel_case_boost(matched_positions, subject(graphemes: graphemes)) do
graphemes
|> Tuple.to_list()
|> camel_positions()
|> Enum.reduce(0, fn position, score ->
if position in matched_positions do
score + @camel_case_boost
else
score
end
end)
end

# iterate over the matches, find the character in the subject with that index, and compare it
# to the one in the pattern, boost if they're the same.
defp do_case_match_boost(_, [], _, boost), do: boost
defp camel_positions(graphemes) do
camel_positions(graphemes, {nil, :lower}, 0, [])
end

defp do_case_match_boost(<<char::utf8, pattern_rest::binary>>, [index | rest], graphemes, boost) do
boost =
if grapheme_to_utf8(graphemes, index) == char do
boost + 1
else
boost
end
defp camel_positions([], _, _, positions) do
Enum.reverse(positions)
end

defp camel_positions([grapheme | rest], {_last_char, :lower}, position, positions) do
case case_of(grapheme) do
:lower ->
camel_positions(rest, {grapheme, :lower}, position + 1, positions)

:upper ->
camel_positions(rest, {grapheme, :upper}, position + 1, [position | positions])
end
end

defp camel_positions([grapheme | rest], {_last_char, :upper}, position, positions) do
camel_positions(rest, {grapheme, case_of(grapheme)}, position + 1, positions)
end

defp case_of(grapheme) do
if String.downcase(grapheme) == grapheme do
:lower
else
:upper
end
end

do_case_match_boost(pattern_rest, rest, graphemes, boost)
defp period_positions(string) do
period_positions(string, 0, [-1])
end

defp grapheme_to_utf8(graphemes, position) do
<<c::utf8>> = elem(graphemes, position)
defp period_positions(<<>>, _, positions), do: MapSet.new(positions)

defp period_positions(<<".", rest::binary>>, position, positions) do
period_positions(rest, position + 1, [position | positions])
end

c
defp period_positions(<<_::utf8, rest::binary>>, position, positions) do
period_positions(rest, position + 1, positions)
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.ScorerTest do
end

describe "matching heuristics" do
@tag :skip
test "more complete matches are boosted" do
results =
score_and_sort(
Expand All @@ -53,7 +52,6 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.ScorerTest do
~w(Lexical.Document Lexical.Document.Range Something.Else.Lexical.Other.Type.Document.Thing)
end

@tag :skip
test "matches at the beginning of the string are boosted" do
results =
score_and_sort(
Expand All @@ -64,16 +62,36 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.ScorerTest do
assert results == ~w(Document Something.Document Something.Else.Document)
end

@tag :skip
test "patterns that match consecutive characters are boosted" do
results = score_and_sort(~w(axxxbxxxcxxxdxxx axxbxxcxxdxx axbxcxdx abcd), "abcd")
assert results == ~w(abcd axbxcxdx axxbxxcxxdxx axxxbxxxcxxxdxxx)
end

@tag :skip
test "patterns that match the case are boosted" do
results = score_and_sort(~w(stinky stinkY StiNkY STINKY), "STINKY")
assert results == ~w(STINKY StiNkY stinkY stinky)
test "patterns that match camel case are boosted" do
results =
score_and_sort(
~w(lotsofcamelcase LotsofcamelCase LotsofCamelCase LotsOfCamelCase),
"LotsOfCamelCase"
)

assert results == ~w(LotsOfCamelCase LotsofCamelCase LotsofcamelCase lotsofcamelcase)
end

test "matches at the end of a module are boosted" do
results =
score_and_sort(
~w(First.Third.Second Third.First.Second First.Second.Third),
"Third"
)

assert ["First.Second.Third" | _] = results
end

test "tail matches are boosted" do
results =
score_and_sort(~w(create_user save_user Foo.Bar.Baz.Demo.Accounts.LiveDemo.User), "User")

assert ["Foo.Bar.Baz.Demo.Accounts.LiveDemo.User" | _] = results
end
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ defmodule Lexical.Server.Provider.Handlers.WorkspaceSymbol do

def handle(%WorkspaceSymbol{} = request, %Env{} = env) do
symbols =
if String.length(request.query) > 3 do
if String.length(request.query) > 1 do
env.project
|> Api.workspace_symbols(request.query)
|> tap(fn symbols -> Logger.info("syms #{inspect(Enum.take(symbols, 5))}") end)
Expand Down

0 comments on commit 2f6ea89

Please sign in to comment.