Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change similarity strategy when finding n best #2720

Merged
merged 2 commits into from
Jan 5, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions gensim/similarities/docsim.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,30 @@ def query_shard(args):
return result


def _nlargest(n, iterable):
"""Helper for extracting n documents with maximum similarity.

Parameters
----------
n : int
Number of elements to be extracted
iterable : iterable of list of (int, float)
Iterable containing documents with computed similarities

Returns
-------
:class:`list`
List with the n largest elements from the dataset defined by iterable.

Notes
-----
Elements are compared by the absolute value of similarity, because negative value of similarity
does not mean some form of dissimilarity.

"""
return heapq.nlargest(n, itertools.chain(*iterable), key=lambda item: abs(item[1]))


class Similarity(interfaces.SimilarityABC):
"""Compute cosine similarity of a dynamic query against a corpus of documents ('the index').

Expand Down Expand Up @@ -539,7 +563,7 @@ def convert(shard_no, doc):
if not is_corpus:
# user asked for num_best most similar and query is a single doc
results = (convert(shard_no, result) for shard_no, result in enumerate(shard_results))
result = heapq.nlargest(self.num_best, itertools.chain(*results), key=lambda item: item[1])
result = _nlargest(self.num_best, results)
else:
# the trickiest combination: returning num_best results when query was a corpus
results = []
Expand All @@ -548,7 +572,7 @@ def convert(shard_no, doc):
results.append(shard_result)
result = []
for parts in zip(*results):
merged = heapq.nlargest(self.num_best, itertools.chain(*parts), key=lambda item: item[1])
merged = _nlargest(self.num_best, parts)
result.append(merged)
if pool:
# gc doesn't seem to collect the Pools, eventually leading to
Expand Down
6 changes: 6 additions & 0 deletions gensim/test/test_similarities.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from gensim.similarities import UniformTermSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import LevenshteinSimilarityIndex
from gensim.similarities.docsim import _nlargest
from gensim.similarities.levenshtein import levdist, levsim

try:
Expand Down Expand Up @@ -532,6 +533,11 @@ def testChunksize(self):
self.assertTrue(numpy.allclose(expected, sims))
index.destroy()

def testNlargest(self):
    """Check that `_nlargest` ranks by absolute similarity, keeping strong negative values."""
    sims = ([(0, 0.8), (1, 0.2), (2, 0.0), (3, 0.0), (4, -0.1), (5, -0.15)],)
    expected = [(0, 0.8), (1, 0.2), (5, -0.15)]
    # Must be assertEqual, not assertTrue: assertTrue(x, msg) treats the second
    # argument as the failure message, so the original assertion passed for any
    # truthy return value and never actually compared the result to `expected`.
    self.assertEqual(_nlargest(3, sims), expected)


class TestWord2VecAnnoyIndexer(unittest.TestCase):

Expand Down