From c586806616f60ba08a657beb019996d7e9ba20a7 Mon Sep 17 00:00:00 2001 From: Steve Cohen Date: Fri, 19 Apr 2024 11:19:17 -0700 Subject: [PATCH 1/5] Added boost for matches directly following the last period If something matches starting after the last period (of a likely module) it is boosted. This makes a match for `MyModule.Structs.User` take precedence over `MyModule.User.Something` while querying for `User` --- .../remote_control/search/fuzzy/scorer.ex | 93 +++++++++++++++---- .../search/fuzzy/scorer_test.exs | 14 ++- 2 files changed, 84 insertions(+), 23 deletions(-) diff --git a/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex b/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex index 2fee53bad..3003a0f1e 100644 --- a/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex +++ b/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex @@ -13,6 +13,7 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do 2. Patterns that match more consecutive characters 3. Patterns that match the beginning of the subject 4. Patterns that match the case of the subject + 5. Patterns that match after the last period Based loosely on https://medium.com/@Srekel/implementing-a-fuzzy-search-algorithm-for-the-debuginator-cacc349e6c55 """ @@ -22,7 +23,7 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do import Record - defrecord :subject, graphemes: nil, normalized: nil + defrecord :subject, graphemes: nil, normalized: nil, last_period_position: nil @typedoc "A match score. Higher numbers mean a more relevant match." @type score :: integer @@ -45,7 +46,14 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do |> String.graphemes() |> List.to_tuple() - subject(graphemes: graphemes, normalized: normalize(subject)) + normalized = normalize(subject) + last_period = last_period_position(normalized) + + subject( + graphemes: graphemes, + normalized: normalized, + last_period_position: last_period + ) end def preprocess(subject) do @@ -85,20 +93,25 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do end end - defp collect_scores(normalized, normalized_pattern, acc \\ []) + defp collect_scores(normalized, normalized_pattern, starting_index \\ 0, acc \\ []) - defp collect_scores(normalized_subject, normalized_pattern, acc) do + defp collect_scores(normalized_subject, normalized_pattern, starting_index, scores) do # we collect scores because it's possible that a better match occurs later # in the subject, and if we start peeling off characters greedily, we'll miss # it. This is more expensive, but it's still pretty quick. 
- case do_score(normalized_subject, normalized_pattern, %__MODULE__{}) do + initial_score = %__MODULE__{index: starting_index} + + case do_score(normalized_subject, normalized_pattern, initial_score) do %__MODULE__{match?: true, matched_character_positions: [pos | _]} = score -> - subject_substring = String.slice(normalized_subject, (pos + 1)..-1//1) - collect_scores(subject_substring, normalized_pattern, [score | acc]) + slice_start = pos + 1 + next_index = starting_index + slice_start + subject_substring = String.slice(normalized_subject, slice_start..-1//1) + scores = [score | scores] + collect_scores(subject_substring, normalized_pattern, next_index, scores) _ -> - acc + scores end end @@ -150,33 +163,54 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do @non_match_score end - defp calculate_score(%__MODULE__{} = score, subject() = subject, pattern) do + defp calculate_score(%__MODULE__{} = score, subject(graphemes: graphemes) = subject, pattern) do pattern_length = String.length(pattern) {consecutive_count, consecutive_bonus} = consecutive_match_bonus(score.matched_character_positions) - match_amount_boost = consecutive_count * pattern_length * 10 - - [first_match_position | _] = score.matched_character_positions + match_amount_boost = consecutive_count * pattern_length pattern_length_boost = pattern_length - # penalize first matches further in the string by making them negative. - first_match_bonus = max(0 - first_match_position, 10) + match_bonus = match_bonus(score, subject) case_match_boost = case_match_boost(pattern, score.matched_character_positions, subject) mismatched_penalty = mismatched_penalty(score.matched_character_positions) - pattern_length_boost + consecutive_bonus + first_match_bonus + case_match_boost + - match_amount_boost - mismatched_penalty + incompleteness_penalty = tuple_size(graphemes) - pattern_length + + result = + pattern_length_boost + consecutive_bonus + match_bonus + case_match_boost + + match_amount_boost - mismatched_penalty - incompleteness_penalty + + result end defp normalize(string) do String.downcase(string) end + @last_period_boost 15 + @max_match_bonus_boost 10 + defp match_bonus(%__MODULE__{} = score, subject(last_period_position: nil)) do + # penalize first matches further in the string by making them negative. + [first_match_position | _] = score.matched_character_positions + max(0 - first_match_position, @max_match_bonus_boost) + end + + defp match_bonus(%__MODULE__{} = score, subject(last_period_position: last_period)) do + [first_match_position | _] = score.matched_character_positions + + if first_match_position == last_period + 1 do + @last_period_boost + else + # penalize first matches further in the string by making them negative. 
+ max(0 - first_match_position, @max_match_bonus_boost) + end + end + @consecutive_character_bonus 15 def consecutive_match_bonus(matched_positions) do @@ -205,9 +239,16 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do def mismatched_penalty(matched_positions) do {penalty, _} = matched_positions - |> Enum.reduce({0, -1}, fn matched_position, {penalty, last_match} -> - distance = matched_position - last_match - {penalty + distance * @mismatched_chracter_penalty, matched_position} + |> Enum.reduce({0, -1}, fn + matched_position, {0, _} -> + # only start counting the penalty after the first match, + # otherwise we will inadvertently penalize matches deeper in the string + {0, matched_position} + + matched_position, {penalty, last_match} -> + distance = matched_position - last_match + + {penalty + distance * @mismatched_chracter_penalty, matched_position} end) penalty @@ -237,4 +278,18 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do c end + + defp last_period_position(string) do + last_period_position(string, 0, nil) + end + + defp last_period_position(<<>>, _, position), do: position + + defp last_period_position(<<".", rest::binary>>, position, _) do + last_period_position(rest, position + 1, position) + end + + defp last_period_position(<<_::utf8, rest::binary>>, position, last_found) do + last_period_position(rest, position + 1, last_found) + end end diff --git a/apps/remote_control/test/lexical/remote_control/search/fuzzy/scorer_test.exs b/apps/remote_control/test/lexical/remote_control/search/fuzzy/scorer_test.exs index 638a1958a..1c664e67d 100644 --- a/apps/remote_control/test/lexical/remote_control/search/fuzzy/scorer_test.exs +++ b/apps/remote_control/test/lexical/remote_control/search/fuzzy/scorer_test.exs @@ -41,7 +41,6 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.ScorerTest do end describe "matching heuristics" do - @tag :skip test "more complete matches are boosted" do results = score_and_sort( @@ -53,7 +52,6 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.ScorerTest do ~w(Lexical.Document Lexical.Document.Range Something.Else.Lexical.Other.Type.Document.Thing) end - @tag :skip test "matches at the beginning of the string are boosted" do results = score_and_sort( @@ -64,16 +62,24 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.ScorerTest do assert results == ~w(Document Something.Document Something.Else.Document) end - @tag :skip test "patterns that match consecutive characters are boosted" do results = score_and_sort(~w(axxxbxxxcxxxdxxx axxbxxcxxdxx axbxcxdx abcd), "abcd") assert results == ~w(abcd axbxcxdx axxbxxcxxdxx axxxbxxxcxxxdxxx) end - @tag :skip test "patterns that match the case are boosted" do results = score_and_sort(~w(stinky stinkY StiNkY STINKY), "STINKY") assert results == ~w(STINKY StiNkY stinkY stinky) end + + test "matches at the end of a module are boosted" do + results = + score_and_sort( + ~w(First.Third.Second Third.First.Second First.Second.Third), + "Third" + ) + + assert ["First.Second.Third" | _] = results + end end end From d28825d93a5c49e3f2be31679555fe1e0b68af3f Mon Sep 17 00:00:00 2001 From: Steve Cohen Date: Mon, 22 Apr 2024 10:56:55 -0700 Subject: [PATCH 2/5] Simplified scoring rules * Removed case match bonus in favor of camel case bonus * Removed pattern length bonus, since it's the same for all subjects * Changed incompleteness penalty to be based on the number of characters matched in the subject rather than the size of the pattern --- .../remote_control/search/fuzzy/scorer.ex | 80 +++++++++++-------- 
.../search/fuzzy/scorer_test.exs | 17 +++- 2 files changed, 61 insertions(+), 36 deletions(-) diff --git a/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex b/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex index 3003a0f1e..62254a84f 100644 --- a/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex +++ b/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex @@ -167,25 +167,20 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do pattern_length = String.length(pattern) {consecutive_count, consecutive_bonus} = - consecutive_match_bonus(score.matched_character_positions) + consecutive_match_boost(score.matched_character_positions) match_amount_boost = consecutive_count * pattern_length - pattern_length_boost = pattern_length + match_boost = match_boost(score, subject) - match_bonus = match_bonus(score, subject) - - case_match_boost = case_match_boost(pattern, score.matched_character_positions, subject) + camel_case_boost = camel_case_boost(score.matched_character_positions, subject) mismatched_penalty = mismatched_penalty(score.matched_character_positions) - incompleteness_penalty = tuple_size(graphemes) - pattern_length - - result = - pattern_length_boost + consecutive_bonus + match_bonus + case_match_boost + - match_amount_boost - mismatched_penalty - incompleteness_penalty + incompleteness_penalty = tuple_size(graphemes) - length(score.matched_character_positions) - result + consecutive_bonus + match_boost + camel_case_boost + + match_amount_boost - mismatched_penalty - incompleteness_penalty end defp normalize(string) do @@ -193,27 +188,27 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do end @last_period_boost 15 - @max_match_bonus_boost 10 - defp match_bonus(%__MODULE__{} = score, subject(last_period_position: nil)) do + @max_match_boost_boost 10 + defp match_boost(%__MODULE__{} = score, subject(last_period_position: nil)) do # penalize first matches further in the string by making them negative. [first_match_position | _] = score.matched_character_positions - max(0 - first_match_position, @max_match_bonus_boost) + max(0 - first_match_position, @max_match_boost_boost) end - defp match_bonus(%__MODULE__{} = score, subject(last_period_position: last_period)) do + defp match_boost(%__MODULE__{} = score, subject(last_period_position: last_period)) do [first_match_position | _] = score.matched_character_positions if first_match_position == last_period + 1 do @last_period_boost else # penalize first matches further in the string by making them negative. - max(0 - first_match_position, @max_match_bonus_boost) + max(0 - first_match_position, @max_match_boost_boost) end end @consecutive_character_bonus 15 - def consecutive_match_bonus(matched_positions) do + def consecutive_match_boost(matched_positions) do # This function checks for consecutive matched characters, and # makes matches with more consecutive matched characters worth more. 
# This means if I type En, it will match Enum more than it will match @@ -254,29 +249,48 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do penalty end - defp case_match_boost(pattern, matched_positions, subject(graphemes: graphemes)) do - do_case_match_boost(pattern, matched_positions, graphemes, 0) + @camel_case_boost 5 + defp camel_case_boost(matched_positions, subject(graphemes: graphemes)) do + graphemes + |> Tuple.to_list() + |> camel_positions() + |> Enum.reduce(0, fn position, score -> + if position in matched_positions do + score + @camel_case_boost + else + score + end + end) end - # iterate over the matches, find the character in the subject with that index, and compare it - # to the one in the pattern, boost if they're the same. - defp do_case_match_boost(_, [], _, boost), do: boost + defp camel_positions(graphemes) do + camel_positions(graphemes, {nil, :lower}, 0, []) + end - defp do_case_match_boost(<>, [index | rest], graphemes, boost) do - boost = - if grapheme_to_utf8(graphemes, index) == char do - boost + 1 - else - boost - end + defp camel_positions([], _, _, positions) do + Enum.reverse(positions) + end + + defp camel_positions([grapheme | rest], {_last_char, :lower}, position, positions) do + case case_of(grapheme) do + :lower -> + camel_positions(rest, {grapheme, :lower}, position + 1, positions) - do_case_match_boost(pattern_rest, rest, graphemes, boost) + :upper -> + camel_positions(rest, {grapheme, :upper}, position + 1, [position | positions]) + end end - defp grapheme_to_utf8(graphemes, position) do - <> = elem(graphemes, position) + defp camel_positions([grapheme | rest], {_last_char, :upper}, position, positions) do + camel_positions(rest, {grapheme, case_of(grapheme)}, position + 1, positions) + end - c + defp case_of(grapheme) do + if String.downcase(grapheme) == grapheme do + :lower + else + :upper + end end defp last_period_position(string) do diff --git a/apps/remote_control/test/lexical/remote_control/search/fuzzy/scorer_test.exs b/apps/remote_control/test/lexical/remote_control/search/fuzzy/scorer_test.exs index 1c664e67d..6efe2e7ff 100644 --- a/apps/remote_control/test/lexical/remote_control/search/fuzzy/scorer_test.exs +++ b/apps/remote_control/test/lexical/remote_control/search/fuzzy/scorer_test.exs @@ -67,9 +67,14 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.ScorerTest do assert results == ~w(abcd axbxcxdx axxbxxcxxdxx axxxbxxxcxxxdxxx) end - test "patterns that match the case are boosted" do - results = score_and_sort(~w(stinky stinkY StiNkY STINKY), "STINKY") - assert results == ~w(STINKY StiNkY stinkY stinky) + test "patterns that match camel case are boosted" do + results = + score_and_sort( + ~w(lotsofcamelcase LotsofcamelCase LotsofCamelCase LotsOfCamelCase), + "LotsOfCamelCase" + ) + + assert results == ~w(LotsOfCamelCase LotsofCamelCase LotsofcamelCase lotsofcamelcase) end test "matches at the end of a module are boosted" do @@ -81,5 +86,11 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.ScorerTest do assert ["First.Second.Third" | _] = results end + + test "modules are boosted" do + results = score_and_sort(~w(create_user save_user Demo.Accounts.User), "User") + + assert ["Demo.Accounts.User" | _] = results + end end end From fc47eb09a6ee15ba210506c6a66e4db82ad9d0d1 Mon Sep 17 00:00:00 2001 From: Steve Cohen Date: Tue, 23 Apr 2024 08:48:31 -0700 Subject: [PATCH 3/5] Boost a tail match rather than the last period --- .../remote_control/search/fuzzy/scorer.ex | 43 ++++++++++--------- .../search/fuzzy/scorer_test.exs | 7 +-- 2 files 
changed, 27 insertions(+), 23 deletions(-) diff --git a/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex b/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex index 62254a84f..4277890e0 100644 --- a/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex +++ b/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex @@ -23,7 +23,7 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do import Record - defrecord :subject, graphemes: nil, normalized: nil, last_period_position: nil + defrecord :subject, graphemes: nil, normalized: nil, period_positions: [-1] @typedoc "A match score. Higher numbers mean a more relevant match." @type score :: integer @@ -47,12 +47,11 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do |> List.to_tuple() normalized = normalize(subject) - last_period = last_period_position(normalized) subject( graphemes: graphemes, normalized: normalized, - last_period_position: last_period + period_positions: period_positions(normalized) ) end @@ -171,7 +170,7 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do match_amount_boost = consecutive_count * pattern_length - match_boost = match_boost(score, subject) + match_boost = match_boost(score, subject, pattern_length) camel_case_boost = camel_case_boost(score.matched_character_positions, subject) @@ -187,19 +186,23 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do String.downcase(string) end - @last_period_boost 15 + @tail_match_boost 55 @max_match_boost_boost 10 - defp match_boost(%__MODULE__{} = score, subject(last_period_position: nil)) do - # penalize first matches further in the string by making them negative. - [first_match_position | _] = score.matched_character_positions - max(0 - first_match_position, @max_match_boost_boost) - end - defp match_boost(%__MODULE__{} = score, subject(last_period_position: last_period)) do + defp match_boost( + %__MODULE__{} = score, + subject(graphemes: graphemes, period_positions: period_positions), + pattern_length + ) do [first_match_position | _] = score.matched_character_positions - if first_match_position == last_period + 1 do - @last_period_boost + match_end = first_match_position + pattern_length + subject_length = tuple_size(graphemes) + + if MapSet.member?(period_positions, first_match_position - 1) and match_end == subject_length do + # reward a complete match at the end of the last period. This is likely a module + # and the pattern matches the most local parts + @tail_match_boost else # penalize first matches further in the string by making them negative. 
max(0 - first_match_position, @max_match_boost_boost) @@ -293,17 +296,17 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do end end - defp last_period_position(string) do - last_period_position(string, 0, nil) + defp period_positions(string) do + period_positions(string, 0, [-1]) end - defp last_period_position(<<>>, _, position), do: position + defp period_positions(<<>>, _, positions), do: MapSet.new(positions) - defp last_period_position(<<".", rest::binary>>, position, _) do - last_period_position(rest, position + 1, position) + defp period_positions(<<".", rest::binary>>, position, positions) do + period_positions(rest, position + 1, [position | positions]) end - defp last_period_position(<<_::utf8, rest::binary>>, position, last_found) do - last_period_position(rest, position + 1, last_found) + defp period_positions(<<_::utf8, rest::binary>>, position, positions) do + period_positions(rest, position + 1, positions) end end diff --git a/apps/remote_control/test/lexical/remote_control/search/fuzzy/scorer_test.exs b/apps/remote_control/test/lexical/remote_control/search/fuzzy/scorer_test.exs index 6efe2e7ff..2faecc5e7 100644 --- a/apps/remote_control/test/lexical/remote_control/search/fuzzy/scorer_test.exs +++ b/apps/remote_control/test/lexical/remote_control/search/fuzzy/scorer_test.exs @@ -87,10 +87,11 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.ScorerTest do assert ["First.Second.Third" | _] = results end - test "modules are boosted" do - results = score_and_sort(~w(create_user save_user Demo.Accounts.User), "User") + test "tail matches are boosted" do + results = + score_and_sort(~w(create_user save_user Foo.Bar.Baz.Demo.Accounts.LiveDemo.User), "User") - assert ["Demo.Accounts.User" | _] = results + assert ["Foo.Bar.Baz.Demo.Accounts.LiveDemo.User" | _] = results end end end From 01d9f11ac050f047cf9b027a21aa02689373c84c Mon Sep 17 00:00:00 2001 From: Steve Cohen Date: Tue, 23 Apr 2024 09:00:10 -0700 Subject: [PATCH 4/5] Returned 0 since we were always returning 10 anyways --- .../lib/lexical/remote_control/search/fuzzy/scorer.ex | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex b/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex index 4277890e0..03b1903ee 100644 --- a/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex +++ b/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex @@ -13,7 +13,7 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do 2. Patterns that match more consecutive characters 3. Patterns that match the beginning of the subject 4. Patterns that match the case of the subject - 5. Patterns that match after the last period + 5. 
Patterns that match the tail of a subject starting at the last period Based loosely on https://medium.com/@Srekel/implementing-a-fuzzy-search-algorithm-for-the-debuginator-cacc349e6c55 """ @@ -170,7 +170,7 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do match_amount_boost = consecutive_count * pattern_length - match_boost = match_boost(score, subject, pattern_length) + match_boost = tail_match_boost(score, subject, pattern_length) camel_case_boost = camel_case_boost(score.matched_character_positions, subject) @@ -187,9 +187,8 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do end @tail_match_boost 55 - @max_match_boost_boost 10 - defp match_boost( + defp tail_match_boost( %__MODULE__{} = score, subject(graphemes: graphemes, period_positions: period_positions), pattern_length @@ -204,8 +203,7 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do # and the pattern matches the most local parts @tail_match_boost else - # penalize first matches further in the string by making them negative. - max(0 - first_match_position, @max_match_boost_boost) + 0 end end From 06ede1b71e10996ac9c5df38bc0b6d677a3745d3 Mon Sep 17 00:00:00 2001 From: Steve Cohen Date: Wed, 24 Apr 2024 08:02:57 -0700 Subject: [PATCH 5/5] dropped min query length to 1 --- .../lib/lexical/server/provider/handlers/workspace_symbol.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/server/lib/lexical/server/provider/handlers/workspace_symbol.ex b/apps/server/lib/lexical/server/provider/handlers/workspace_symbol.ex index 97cd15f72..5fbeb0491 100644 --- a/apps/server/lib/lexical/server/provider/handlers/workspace_symbol.ex +++ b/apps/server/lib/lexical/server/provider/handlers/workspace_symbol.ex @@ -14,7 +14,7 @@ defmodule Lexical.Server.Provider.Handlers.WorkspaceSymbol do def handle(%WorkspaceSymbol{} = request, %Env{} = env) do symbols = - if String.length(request.query) > 3 do + if String.length(request.query) > 1 do env.project |> Api.workspace_symbols(request.query) |> tap(fn symbols -> Logger.info("syms #{inspect(Enum.take(symbols, 5))}") end)
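
For illustration, a minimal standalone sketch of the tail-match condition introduced in patch 3/5, simplified to the contiguous case. The `TailBoostSketch` module and its `boost/2` helper are hypothetical and not part of the patch; the real logic lives in the scorer's private `tail_match_boost/3`, which works from the already-matched character positions rather than a suffix check.

  defmodule TailBoostSketch do
    @tail_match_boost 55

    # Grant the boost only when a contiguous match starts right after a
    # period (or at position 0) and runs to the very end of the subject,
    # e.g. "User" against "Demo.Accounts.User".
    def boost(subject, pattern) do
      period_positions =
        subject
        |> String.graphemes()
        |> Enum.with_index()
        |> Enum.filter(fn {grapheme, _index} -> grapheme == "." end)
        |> Enum.map(fn {_grapheme, index} -> index end)

      first_match_position = String.length(subject) - String.length(pattern)

      tail_match? =
        String.ends_with?(String.downcase(subject), String.downcase(pattern)) and
          (first_match_position - 1) in [-1 | period_positions]

      if tail_match?, do: @tail_match_boost, else: 0
    end
  end

  TailBoostSketch.boost("Foo.Bar.Baz.Demo.Accounts.LiveDemo.User", "User")
  #=> 55
  TailBoostSketch.boost("First.Third.Second", "Third")
  #=> 0

Including -1 in the allowed positions mirrors the `period_positions: [-1]` default in the `subject` record, so a pattern that covers an entire un-dotted subject still earns the boost.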