Skip to content

Commit

Permalink
Merge pull request #74 from loupe-php/fix/relevance-for-non-existent-…
Browse files Browse the repository at this point in the history
…terms

Fixed relevance calculation for non-existent terms
  • Loading branch information
Toflar authored Mar 20, 2024
2 parents 00cd6ad + 4fe35f8 commit cda7035
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 8 deletions.
11 changes: 8 additions & 3 deletions src/Internal/CosineSimilarity.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,20 @@ class CosineSimilarity
*/
private static array $queryTfIdfsCache = [];

public static function fromQuery(string $queryId, string $queryIdfs, string $documentTfIdfs): float
public static function fromQuery(string $queryId, string $totalTokenCount, string $queryIdfs, string $documentTfIdfs): float
{
// First we have to turn the query term IDFs into TF-IDF (we only have to do this once per query, so we can cache that)
if (!isset(self::$queryTfIdfsCache[$queryId])) {
$queryIdfs = array_map('floatval', explode(',', $queryIdfs));
$tf = 1 / \count($queryIdfs);
self::$queryTfIdfsCache[$queryId] = array_map(function (float $idf) use ($tf) {

// Calculate the TF-IDF for the query. It's important that we pad the term matches to the total number of tokens
// searched so that terms that do not exist anywhere in our index are handled with a TF-IDF of 1. Otherwise, if you
// searched for "foobar" and not a single document in your index contained "foobar", that term would not be
// considered in the similarity at all, giving you completely wrong results.
self::$queryTfIdfsCache[$queryId] = array_pad(array_map(function (float $idf) use ($tf) {
return $tf * $idf;
}, $queryIdfs);
}, $queryIdfs), (int) $totalTokenCount, 1);
}

return self::similarity(self::$queryTfIdfsCache[$queryId], array_map('floatval', explode(',', $documentTfIdfs)));
Expand Down
2 changes: 2 additions & 0 deletions src/Internal/Search/Sorting/Relevance.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,13 @@ public function apply(Searcher $searcher, Engine $engine): void
*/
$select = sprintf(
'loupe_relevance(
%s,
%s,
(SELECT group_concat(idf) FROM %s),
(SELECT group_concat(tfidf) FROM %s WHERE %s.id=document)
) AS %s',
$searcher->getQueryBuilder()->createNamedParameter($searcher->getQueryId()),
$searcher->getQueryBuilder()->createNamedParameter(\count($searcher->getTokens()->allTermsWithVariants())),
Searcher::CTE_TERM_MATCHES,
Searcher::CTE_TERM_DOCUMENT_MATCHES,
$engine->getIndexInfo()->getAliasForTable(IndexInfo::TABLE_NAME_DOCUMENTS),
Expand Down
2 changes: 1 addition & 1 deletion src/LoupeFactory.php
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ private function registerSQLiteFunctions(Connection $connection, string $sqliteV
],
'loupe_relevance' => [
'callback' => [CosineSimilarity::class, 'fromQuery'],
'numArgs' => 3,
'numArgs' => 4,
],
];

Expand Down
63 changes: 59 additions & 4 deletions tests/Functional/SearchTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -1228,14 +1228,14 @@ public function testPhraseSearch(): void
'id' => 11,
'title' => 'Star Wars',
],
[
'id' => 25,
'title' => 'Jarhead',
],
[
'id' => 28,
'title' => 'Apocalypse Now',
],
[
'id' => 25,
'title' => 'Jarhead',
],
],
'query' => 'I like Star Wars',
'hitsPerPage' => 20,
Expand Down Expand Up @@ -1469,6 +1469,61 @@ public function testRelevanceAndRankingScore(): void
]);
}

/**
 * Searches for a query in which one term ("foobar") occurs in none of the
 * indexed documents, while the other two terms ("life", "learning") do.
 *
 * All three documents are expected as hits, ordered 1, 3, 2, each with the
 * exact ranking score listed below.
 */
public function testRelevanceAndRankingScoreForNonExistentQueryTerms(): void
{
    // Fixture texts; none of them contains the term "foobar".
    $gameOfLife = 'The game of life is a game of everlasting learning';
    $unexaminedLife = 'The unexamined life is not worth living';
    $neverStop = 'Never stop learning';

    $query = 'foobar life learning';

    $config = Configuration::create()
        ->withSearchableAttributes(['content'])
        ->withSortableAttributes(['content']);

    $loupe = $this->createLoupe($config);
    $loupe->addDocuments([
        [
            'id' => 1,
            'content' => $gameOfLife,
        ],
        [
            'id' => 2,
            'content' => $unexaminedLife,
        ],
        [
            'id' => 3,
            'content' => $neverStop,
        ],
    ]);

    $parameters = SearchParameters::create()
        ->withQuery($query)
        ->withAttributesToRetrieve(['id', 'content'])
        ->withShowRankingScore(true);

    // Expected order: document 1 (contains both existing terms), then 3, then 2.
    $expectedHits = [
        [
            'id' => 1,
            'content' => $gameOfLife,
            '_rankingScore' => 0.6301,
        ],
        [
            'id' => 3,
            'content' => $neverStop,
            '_rankingScore' => 0.51447,
        ],
        [
            'id' => 2,
            'content' => $unexaminedLife,
            '_rankingScore' => 0.36379,
        ],
    ];

    $this->searchAndAssertResults($loupe, $parameters, [
        'hits' => $expectedHits,
        'query' => $query,
        'hitsPerPage' => 20,
        'page' => 1,
        'totalPages' => 1,
        'totalHits' => 3,
    ]);
}

public function testSearchingForNumericArrayType(): void
{
$configuration = Configuration::create()
Expand Down

0 comments on commit cda7035

Please sign in to comment.