Skip to content

Commit

Permalink
Use jaro_winkler similarity instead of rapidfuzz (#491)
Browse files Browse the repository at this point in the history
### Description

Use the `jaro_winkler_similarity` function from the jellyfish module instead of rapidfuzz.
It runs at the same speed and gives better results.
  • Loading branch information
Rotheem authored Aug 6, 2024
1 parent 6da8c60 commit cadf2f1
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 23 deletions.
10 changes: 7 additions & 3 deletions app/core/users/endpoints_users.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,11 @@
)
from app.types.content_type import ContentType
from app.utils.mail.mailworker import send_email
from app.utils.tools import fuzzy_search_user, get_file_from_data, save_file_as_data
from app.utils.tools import (
get_file_from_data,
save_file_as_data,
sort_user,
)

router = APIRouter(tags=["Users"])

Expand Down Expand Up @@ -95,7 +99,7 @@ async def search_users(
user: models_core.CoreUser = Depends(is_user_an_ecl_member),
):
"""
Search for a user using Fuzzy String Matching
Search for a user using Jaro_Winkler distance algorithm. The
`query` will be compared against users name, firstname and nickname
Expand All @@ -108,7 +112,7 @@ async def search_users(
excluded_groups=excludedGroups,
)

return fuzzy_search_user(query, users)
return sort_user(query, users)


@router.get(
Expand Down
60 changes: 41 additions & 19 deletions app/utils/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
import fitz
from fastapi import HTTPException, UploadFile
from fastapi.responses import FileResponse
from jellyfish import jaro_winkler_similarity
from pydantic import ValidationError
from rapidfuzz import process
from sqlalchemy.ext.asyncio import AsyncSession

from app.core import cruds_core, models_core
Expand Down Expand Up @@ -60,7 +60,7 @@ def is_user_external(
return user.external is True


def fuzzy_search_user(
def sort_user(
query: str,
users: Sequence[models_core.CoreUser],
limit: int = 10,
Expand All @@ -71,26 +71,48 @@ def fuzzy_search_user(
`query` will be compared against `users` name, firstname and nickname.
The size of the answer can be limited using `limit` parameter.
Use RapidFuzz library
Use Jellyfish library
"""

# We can give a dictionary of {object: string used for the comparison} to the extract function
# https://maxbachmann.github.io/RapidFuzz/Usage/process.html#extract

# TODO: we may want to cache this object. Its generation may take some time if there is a big user base
choices = []

for user in users:
choices.append(f"{user.firstname} {user.name} {user.nickname}")

results: list[tuple[str, int | float, int]] = process.extract(
query,
choices,
limit=limit,
)

# results has the format : (string used for the comparison, similarity score, index of the object in the choices collection)
return [users[res[2]] for res in results]
names = [f"{user.firstname} {user.name}" for user in users]
nicknames = [user.nickname for user in users]
scored: list[
tuple[CoreUser, float, float, int]
] = [ # (user, name_score, nickname_score, index)
(
user,
jaro_winkler_similarity(query, name),
jaro_winkler_similarity(query, nickname) if nickname else 0,
index,
)
for index, (user, name, nickname) in enumerate(
zip(users, names, nicknames, strict=True),
)
]

results = []
for _ in range(min(limit, len(scored))):
maximum_name = max(scored, key=lambda r: r[1])
maximum_nickname = max(scored, key=lambda r: r[2])
if maximum_name[1] > maximum_nickname[1]:
results.append(maximum_name)
scored[maximum_name[3]] = ( # We don't want to use this user again
maximum_name[0],
-1,
-1,
maximum_name[3],
)
else:
results.append(maximum_nickname)
scored[maximum_nickname[3]] = ( # We don't want to use this user again
maximum_nickname[0],
-1,
-1,
maximum_nickname[3],
)

return [result[0] for result in results]


async def is_group_id_valid(group_id: str, db: AsyncSession) -> bool:
Expand Down
2 changes: 1 addition & 1 deletion requirements-common.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@ fastapi==0.111.0
firebase-admin==6.5.0 # Firebase is used for push notification
HelloAssoAPIWrapper==1.0.0
icalendar==5.0.13
jellyfish==1.0.4 # String Matching
Jinja2==3.1.4 # template engine for html files
pydantic-settings==2.3.4
pydantic==2.7.4
pyjwt[crypto]==2.8.0 # generate and verify the JWT tokens, imported as `jwt`
PyMuPDF==1.24.9 # PDF processing
python-dotenv==1.0.1 # load environment variables from .env file
python-multipart==0.0.9 # a form data parser, as oauth flow requires form-data parameters
rapidfuzz==3.9.5 # Fuzzy String Matching
redis==5.0.8
requests==2.32.3
SQLAlchemy[asyncio]==2.0.32 # [asyncio] allows greenlet to be installed on Apple M1 devices.
Expand Down

0 comments on commit cadf2f1

Please sign in to comment.