From 0a1ac0a00f4992a197c23972b7e74c4cd5c4f522 Mon Sep 17 00:00:00 2001
From: Petitoto <27863028+Petitoto@users.noreply.github.com>
Date: Sun, 29 Sep 2024 16:18:01 +0200
Subject: [PATCH] Users fuzzy search enhancements (#541)

### Description

This PR improves users fuzzy search results.

The search algorithm is refactored in a simpler way (see the sketch at the end of this section):
- for each user:
  - assign a score based on the query, the user attributes and a similarity algorithm
  - insert the score and its corresponding user into a sorted list
  - remove the lowest score from the list if its size exceeds the `limit` parameter (thus always keeping the list sorted with the `N=limit` best results)

The score of each user corresponds to the highest Jaro-Winkler similarity between the query and:
- firstname
- name
- firstname + name
- name + firstname
- nickname (if it exists)

This method aims to fit real searches: queries are usually related to one of these 5 strings, but we don't know which one. The higher the similarity between the query and one of them, the more likely the query is related to it. For well-constructed queries, false positives will always come after the good results.

Before running the Jaro-Winkler algorithm, all strings are "unaccented" to make the similarity algorithm insensitive to accents.

Moreover, queries from the `/users/search` endpoint are "capworded" (capitalized with `string.capwords()`). We assume that queries from this endpoint are usually the beginning of a name / firstname / nickname. This way, a query like `max` will match `Maxou` better than `Kmax`, which likely corresponds better to what the end user is searching for.

This method has proven to give better results on limited test sets of users (~30) and queries (~20), while remaining among the fastest options tested. Other methods tested include:
- the previous `sort_user()` function
- `SequenceMatcher` from the standard difflib library
- the Jaro-Winkler algorithm from the RapidFuzz library (although, on longer strings, RapidFuzz claims to be faster than Jellyfish)
- the Indel algorithm from the RapidFuzz library
- the Damerau-Levenshtein algorithm from the RapidFuzz library
- `partial_ratio()` from the RapidFuzz library
- `token_ratio()` from the RapidFuzz library
- `partial_token_ratio()` from the RapidFuzz library

On the tested data with a `limit` parameter of 10, the new function introduced by this PR takes 50% more time than the old one. This is due to more similarities being computed per user (with 4 instead of 5, both take approximately the same time). However, it still seems reasonable for production use, and it behaves better than the other tested algorithms, especially when `limit` or the number of users in the database increases.
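The sketch below is a rough, self-contained version of the scoring loop described above, so it can be tried outside the app; the real implementation is the patched `sort_user()` in `app/utils/tools.py` below. `SimpleUser` and `top_matches` are made-up names standing in for Hyperion's `CoreUser` and `sort_user`, and it needs Python 3.10+ (for the `key` argument of `bisect.insort`) and the Jellyfish library.

```python
# Rough, runnable sketch of the approach described above, not the exact patched code.
# SimpleUser is a made-up stand-in for the app's CoreUser model.
import bisect
import string
import unicodedata
from dataclasses import dataclass

from jellyfish import jaro_winkler_similarity


@dataclass
class SimpleUser:
    firstname: str
    name: str
    nickname: str | None = None


def unaccent(s: str) -> str:
    # Strip accents so the similarity is accent-insensitive.
    return unicodedata.normalize("NFKD", s).encode("ASCII", "ignore").decode("utf8")


def top_matches(query: str, users: list[SimpleUser], limit: int = 10) -> list[SimpleUser]:
    query = unaccent(string.capwords(query))  # queries are assumed to start a name
    scored: list[tuple[SimpleUser, float]] = []
    for user in users:
        firstname, name = unaccent(user.firstname), unaccent(user.name)
        nickname = unaccent(user.nickname) if user.nickname else None
        # Score = best Jaro-Winkler similarity over the 5 candidate strings.
        score = max(
            jaro_winkler_similarity(query, firstname),
            jaro_winkler_similarity(query, name),
            jaro_winkler_similarity(query, f"{firstname} {name}"),
            jaro_winkler_similarity(query, f"{name} {firstname}"),
            jaro_winkler_similarity(query, nickname) if nickname else 0,
        )
        # Keep `scored` sorted by score; drop the lowest entry when over `limit`,
        # so the list always holds the N=limit best results seen so far.
        bisect.insort(scored, (user, score), key=lambda s: s[1])
        if len(scored) > limit:
            scored.pop(0)
    return [user for user, _ in reversed(scored)]  # best match first


if __name__ == "__main__":
    users = [
        SimpleUser("Maxime", "Dupont", "Maxou"),
        SimpleUser("Hélène", "Kmax"),
        SimpleUser("Jean", "Martin"),
    ]
    print(top_matches("max", users, limit=2))  # "Maxime/Maxou" should rank first
```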

### Checklist

- [ ] Created tests which fail without the change (if possible)
- [x] All tests passing
- [ ] Extended the documentation, if necessary

---------

Co-authored-by: Petitoto
---
 app/core/users/endpoints_users.py |  9 ++---
 app/utils/tools.py                | 59 ++++++++++++------------------
 2 files changed, 27 insertions(+), 41 deletions(-)

diff --git a/app/core/users/endpoints_users.py b/app/core/users/endpoints_users.py
index c6ed08ae9..e74bde603 100644
--- a/app/core/users/endpoints_users.py
+++ b/app/core/users/endpoints_users.py
@@ -1,5 +1,6 @@
 import logging
 import re
+import string
 import uuid
 from datetime import UTC, datetime, timedelta
 
@@ -103,9 +104,9 @@ async def search_users(
     user: models_core.CoreUser = Depends(is_user_an_ecl_member),
 ):
     """
-    Search for a user using Jaro_Winkler distance algorithm. The
-
-    `query` will be compared against users name, firstname and nickname
+    Search for a user using Jaro_Winkler distance algorithm.
+    The `query` will be compared against users name, firstname and nickname.
+    Assume that `query` is the beginning of a name, so we can capitalize words to improve results.
 
     **The user must be authenticated to use this endpoint**
     """
@@ -116,7 +117,7 @@ async def search_users(
         excluded_groups=excludedGroups,
     )
 
-    return sort_user(query, users)
+    return sort_user(string.capwords(query), users)
 
 
 @router.get(
diff --git a/app/utils/tools.py b/app/utils/tools.py
index d261d3f41..d868d2975 100644
--- a/app/utils/tools.py
+++ b/app/utils/tools.py
@@ -1,7 +1,9 @@
+import bisect
 import logging
 import os
 import re
 import secrets
+import unicodedata
 from collections.abc import Sequence
 from pathlib import Path
 from typing import TYPE_CHECKING, TypeVar
@@ -80,50 +82,33 @@ def sort_user(
     Search for users using Fuzzy String Matching
 
     `query` will be compared against `users` name, firstname and nickname.
+    Accents will be ignored.
     The size of the answer can be limited using `limit` parameter.
 
-    Use Jellyfish library
+    Use Jaro-Winkler algorithm from Jellyfish library.
     """
 
-    # TODO: we may want to cache this object. Its generation may take some time if there is a big user base
-    names = [f"{user.firstname} {user.name}" for user in users]
-    nicknames = [user.nickname for user in users]
-    scored: list[
-        tuple[CoreUser, float, float, int]
-    ] = [  # (user, name_score, nickname_score, index)
-        (
-            user,
+    def unaccent(s: str) -> str:
+        return unicodedata.normalize("NFKD", s).encode("ASCII", "ignore").decode("utf8")
+
+    query = unaccent(query)
+    scored: list[tuple[CoreUser, float]] = []
+    for user in users:
+        firstname = unaccent(user.firstname)
+        name = unaccent(user.name)
+        nickname = unaccent(user.nickname) if user.nickname else None
+        score = max(
+            jaro_winkler_similarity(query, firstname),
             jaro_winkler_similarity(query, name),
+            jaro_winkler_similarity(query, f"{firstname} {name}"),
+            jaro_winkler_similarity(query, f"{name} {firstname}"),
             jaro_winkler_similarity(query, nickname) if nickname else 0,
-            index,
         )
-        for index, (user, name, nickname) in enumerate(
-            zip(users, names, nicknames, strict=True),
-        )
-    ]
-
-    results = []
-    for _ in range(min(limit, len(scored))):
-        maximum_name = max(scored, key=lambda r: r[1])
-        maximum_nickname = max(scored, key=lambda r: r[2])
-        if maximum_name[1] > maximum_nickname[2]:
-            results.append(maximum_name)
-            scored[maximum_name[3]] = (  # We don't want to use this user again
-                maximum_name[0],
-                -1,
-                -1,
-                maximum_name[3],
-            )
-        else:
-            results.append(maximum_nickname)
-            scored[maximum_nickname[3]] = (  # We don't want to use this user again
-                maximum_nickname[0],
-                -1,
-                -1,
-                maximum_nickname[3],
-            )
-
-    return [result[0] for result in results]
+        bisect.insort(scored, (user, score), key=(lambda s: s[1]))
+        if len(scored) > limit:
+            scored.pop(0)
+
+    return [user for user, _ in reversed(scored)]
 
 
 async def is_group_id_valid(group_id: str, db: AsyncSession) -> bool:
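
For reviewers who want to see the effect of the two preprocessing choices applied above (capitalizing the query with `string.capwords()` and stripping accents before scoring), here is a small illustrative snippet, separate from the patch; the printed values are approximate and may vary slightly with the Jellyfish version, only the ordering matters.

```python
# Quick standalone check (not part of the patch) of the two preprocessing steps:
# string.capwords() on the query and accent stripping before scoring.
import string
import unicodedata

from jellyfish import jaro_winkler_similarity


def unaccent(s: str) -> str:
    return unicodedata.normalize("NFKD", s).encode("ASCII", "ignore").decode("utf8")


# Raw lowercase query: "Kmax" wins because Jaro-Winkler is case sensitive.
print(jaro_winkler_similarity("max", "Maxou"), jaro_winkler_similarity("max", "Kmax"))
# ~0.69 vs ~0.92

# Capworded query: "Maxou" wins thanks to the shared "Max" prefix bonus.
query = string.capwords("max")  # "Max"
print(jaro_winkler_similarity(query, "Maxou"), jaro_winkler_similarity(query, "Kmax"))
# ~0.91 vs ~0.72

# Accent stripping makes "Hélène" and "Helene" compare as identical strings.
print(unaccent("Hélène") == "Helene")  # True
```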