From b2163681c0b87c36f29bba0583f9546c15c0dd38 Mon Sep 17 00:00:00 2001 From: Steve Cohen Date: Wed, 24 Apr 2024 08:10:30 -0700 Subject: [PATCH] Added boost for matches directly following the last period (#708) * Added boost for matches directly following the last period If something matches starting after the last period (of a likely module) it is boosted. This makes a match for `MyModule.Structs.User` take precedence over `MyModule.User.Something` while querying for `User` * Simplified scoring rules * Removed case match bonus in favor of camel case bonus * Removed pattern length bonus, since it's the same for all subjects * Changed incompleteness penalty to be based on the number of characters matched in the subject rather than the size of the pattern * dropped min query length to 1 --- .../remote_control/search/fuzzy/scorer.ex | 150 +++++++++++++----- .../search/fuzzy/scorer_test.exs | 32 +++- .../provider/handlers/workspace_symbol.ex | 2 +- 3 files changed, 136 insertions(+), 48 deletions(-) diff --git a/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex b/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex index 2fee53bad..03b1903ee 100644 --- a/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex +++ b/apps/remote_control/lib/lexical/remote_control/search/fuzzy/scorer.ex @@ -13,6 +13,7 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do 2. Patterns that match more consecutive characters 3. Patterns that match the beginning of the subject 4. Patterns that match the case of the subject + 5. 
Patterns that match the tail of a subject starting at the last period Based loosely on https://medium.com/@Srekel/implementing-a-fuzzy-search-algorithm-for-the-debuginator-cacc349e6c55 """ @@ -22,7 +23,7 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do import Record - defrecord :subject, graphemes: nil, normalized: nil + defrecord :subject, graphemes: nil, normalized: nil, period_positions: [-1] @typedoc "A match score. Higher numbers mean a more relevant match." @type score :: integer @@ -45,7 +46,13 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do |> String.graphemes() |> List.to_tuple() - subject(graphemes: graphemes, normalized: normalize(subject)) + normalized = normalize(subject) + + subject( + graphemes: graphemes, + normalized: normalized, + period_positions: period_positions(normalized) + ) end def preprocess(subject) do @@ -85,20 +92,25 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do end end - defp collect_scores(normalized, normalized_pattern, acc \\ []) + defp collect_scores(normalized, normalized_pattern, starting_index \\ 0, acc \\ []) - defp collect_scores(normalized_subject, normalized_pattern, acc) do + defp collect_scores(normalized_subject, normalized_pattern, starting_index, scores) do # we collect scores because it's possible that a better match occurs later # in the subject, and if we start peeling off characters greedily, we'll miss # it. This is more expensive, but it's still pretty quick. 
- case do_score(normalized_subject, normalized_pattern, %__MODULE__{}) do + initial_score = %__MODULE__{index: starting_index} + + case do_score(normalized_subject, normalized_pattern, initial_score) do %__MODULE__{match?: true, matched_character_positions: [pos | _]} = score -> - subject_substring = String.slice(normalized_subject, (pos + 1)..-1//1) - collect_scores(subject_substring, normalized_pattern, [score | acc]) + slice_start = pos + 1 + next_index = starting_index + slice_start + subject_substring = String.slice(normalized_subject, slice_start..-1//1) + scores = [score | scores] + collect_scores(subject_substring, normalized_pattern, next_index, scores) _ -> - acc + scores end end @@ -150,36 +162,54 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do @non_match_score end - defp calculate_score(%__MODULE__{} = score, subject() = subject, pattern) do + defp calculate_score(%__MODULE__{} = score, subject(graphemes: graphemes) = subject, pattern) do pattern_length = String.length(pattern) {consecutive_count, consecutive_bonus} = - consecutive_match_bonus(score.matched_character_positions) - - match_amount_boost = consecutive_count * pattern_length * 10 - - [first_match_position | _] = score.matched_character_positions + consecutive_match_boost(score.matched_character_positions) - pattern_length_boost = pattern_length + match_amount_boost = consecutive_count * pattern_length - # penalize first matches further in the string by making them negative. 
- first_match_bonus = max(0 - first_match_position, 10) + match_boost = tail_match_boost(score, subject, pattern_length) - case_match_boost = case_match_boost(pattern, score.matched_character_positions, subject) + camel_case_boost = camel_case_boost(score.matched_character_positions, subject) mismatched_penalty = mismatched_penalty(score.matched_character_positions) - pattern_length_boost + consecutive_bonus + first_match_bonus + case_match_boost + - match_amount_boost - mismatched_penalty + incompleteness_penalty = tuple_size(graphemes) - length(score.matched_character_positions) + + consecutive_bonus + match_boost + camel_case_boost + + match_amount_boost - mismatched_penalty - incompleteness_penalty end defp normalize(string) do String.downcase(string) end + @tail_match_boost 55 + + defp tail_match_boost( + %__MODULE__{} = score, + subject(graphemes: graphemes, period_positions: period_positions), + pattern_length + ) do + [first_match_position | _] = score.matched_character_positions + + match_end = first_match_position + pattern_length + subject_length = tuple_size(graphemes) + + if MapSet.member?(period_positions, first_match_position - 1) and match_end == subject_length do + # reward a complete match at the end of the last period. This is likely a module + # and the pattern matches the most local parts + @tail_match_boost + else + 0 + end + end + @consecutive_character_bonus 15 - def consecutive_match_bonus(matched_positions) do + def consecutive_match_boost(matched_positions) do # This function checks for consecutive matched characters, and # makes matches with more consecutive matched characters worth more. 
# This means if I type En, it will match Enum more than it will match @@ -205,36 +235,76 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.Scorer do def mismatched_penalty(matched_positions) do {penalty, _} = matched_positions - |> Enum.reduce({0, -1}, fn matched_position, {penalty, last_match} -> - distance = matched_position - last_match - {penalty + distance * @mismatched_chracter_penalty, matched_position} + |> Enum.reduce({0, -1}, fn + matched_position, {0, -1} -> + # the -1 sentinel marks the first match: only start counting the penalty after it, + # otherwise we will inadvertently penalize matches deeper in the string + {0, matched_position} + + matched_position, {penalty, last_match} -> + distance = matched_position - last_match + + {penalty + distance * @mismatched_chracter_penalty, matched_position} end) penalty end - defp case_match_boost(pattern, matched_positions, subject(graphemes: graphemes)) do - do_case_match_boost(pattern, matched_positions, graphemes, 0) + @camel_case_boost 5 + defp camel_case_boost(matched_positions, subject(graphemes: graphemes)) do + graphemes + |> Tuple.to_list() + |> camel_positions() + |> Enum.reduce(0, fn position, score -> + if position in matched_positions do + score + @camel_case_boost + else + score + end + end) end - # iterate over the matches, find the character in the subject with that index, and compare it - # to the one in the pattern, boost if they're the same. 
- defp do_case_match_boost(_, [], _, boost), do: boost + defp camel_positions(graphemes) do + camel_positions(graphemes, {nil, :lower}, 0, []) + end - defp do_case_match_boost(<>, [index | rest], graphemes, boost) do - boost = - if grapheme_to_utf8(graphemes, index) == char do - boost + 1 - else - boost - end + defp camel_positions([], _, _, positions) do + Enum.reverse(positions) + end + + defp camel_positions([grapheme | rest], {_last_char, :lower}, position, positions) do + case case_of(grapheme) do + :lower -> + camel_positions(rest, {grapheme, :lower}, position + 1, positions) + + :upper -> + camel_positions(rest, {grapheme, :upper}, position + 1, [position | positions]) + end + end + + defp camel_positions([grapheme | rest], {_last_char, :upper}, position, positions) do + camel_positions(rest, {grapheme, case_of(grapheme)}, position + 1, positions) + end + + defp case_of(grapheme) do + if String.downcase(grapheme) == grapheme do + :lower + else + :upper + end + end - do_case_match_boost(pattern_rest, rest, graphemes, boost) + defp period_positions(string) do + period_positions(string, 0, [-1]) end - defp grapheme_to_utf8(graphemes, position) do - <> = elem(graphemes, position) + defp period_positions(<<>>, _, positions), do: MapSet.new(positions) + + defp period_positions(<<".", rest::binary>>, position, positions) do + period_positions(rest, position + 1, [position | positions]) + end - c + defp period_positions(<<_::utf8, rest::binary>>, position, positions) do + period_positions(rest, position + 1, positions) end end diff --git a/apps/remote_control/test/lexical/remote_control/search/fuzzy/scorer_test.exs b/apps/remote_control/test/lexical/remote_control/search/fuzzy/scorer_test.exs index 638a1958a..2faecc5e7 100644 --- a/apps/remote_control/test/lexical/remote_control/search/fuzzy/scorer_test.exs +++ b/apps/remote_control/test/lexical/remote_control/search/fuzzy/scorer_test.exs @@ -41,7 +41,6 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.ScorerTest do 
end describe "matching heuristics" do - @tag :skip test "more complete matches are boosted" do results = score_and_sort( @@ -53,7 +52,6 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.ScorerTest do ~w(Lexical.Document Lexical.Document.Range Something.Else.Lexical.Other.Type.Document.Thing) end - @tag :skip test "matches at the beginning of the string are boosted" do results = score_and_sort( @@ -64,16 +62,36 @@ defmodule Lexical.RemoteControl.Search.Fuzzy.ScorerTest do assert results == ~w(Document Something.Document Something.Else.Document) end - @tag :skip test "patterns that match consecutive characters are boosted" do results = score_and_sort(~w(axxxbxxxcxxxdxxx axxbxxcxxdxx axbxcxdx abcd), "abcd") assert results == ~w(abcd axbxcxdx axxbxxcxxdxx axxxbxxxcxxxdxxx) end - @tag :skip - test "patterns that match the case are boosted" do - results = score_and_sort(~w(stinky stinkY StiNkY STINKY), "STINKY") - assert results == ~w(STINKY StiNkY stinkY stinky) + test "patterns that match camel case are boosted" do + results = + score_and_sort( + ~w(lotsofcamelcase LotsofcamelCase LotsofCamelCase LotsOfCamelCase), + "LotsOfCamelCase" + ) + + assert results == ~w(LotsOfCamelCase LotsofCamelCase LotsofcamelCase lotsofcamelcase) + end + + test "matches at the end of a module are boosted" do + results = + score_and_sort( + ~w(First.Third.Second Third.First.Second First.Second.Third), + "Third" + ) + + assert ["First.Second.Third" | _] = results + end + + test "tail matches are boosted" do + results = + score_and_sort(~w(create_user save_user Foo.Bar.Baz.Demo.Accounts.LiveDemo.User), "User") + + assert ["Foo.Bar.Baz.Demo.Accounts.LiveDemo.User" | _] = results end end end diff --git a/apps/server/lib/lexical/server/provider/handlers/workspace_symbol.ex b/apps/server/lib/lexical/server/provider/handlers/workspace_symbol.ex index 97cd15f72..5fbeb0491 100644 --- a/apps/server/lib/lexical/server/provider/handlers/workspace_symbol.ex +++ 
b/apps/server/lib/lexical/server/provider/handlers/workspace_symbol.ex @@ -14,7 +14,7 @@ defmodule Lexical.Server.Provider.Handlers.WorkspaceSymbol do def handle(%WorkspaceSymbol{} = request, %Env{} = env) do symbols = - if String.length(request.query) > 3 do + if String.length(request.query) > 1 do env.project |> Api.workspace_symbols(request.query) |> tap(fn symbols -> Logger.info("syms #{inspect(Enum.take(symbols, 5))}") end)