diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py index 692d76e18a..256f276394 100755 --- a/gensim/similarities/docsim.py +++ b/gensim/similarities/docsim.py @@ -233,6 +233,30 @@ def query_shard(args): return result +def _nlargest(n, iterable): + """Helper for extracting n documents with maximum similarity. + + Parameters + ---------- + n : int + Number of elements to be extracted + iterable : iterable of list of (int, float) + Iterable containing documents with computed similarities + + Returns + ------- + :class:`list` + List with the n largest elements from the dataset defined by iterable. + + Notes + ----- + Elements are compared by the absolute value of similarity, because a negative similarity value + does not indicate dissimilarity. + + """ + return heapq.nlargest(n, itertools.chain(*iterable), key=lambda item: abs(item[1])) + + class Similarity(interfaces.SimilarityABC): """Compute cosine similarity of a dynamic query against a corpus of documents ('the index'). 
@@ -539,7 +563,7 @@ def convert(shard_no, doc): if not is_corpus: # user asked for num_best most similar and query is a single doc results = (convert(shard_no, result) for shard_no, result in enumerate(shard_results)) - result = heapq.nlargest(self.num_best, itertools.chain(*results), key=lambda item: item[1]) + result = _nlargest(self.num_best, results) else: # the trickiest combination: returning num_best results when query was a corpus results = [] @@ -548,7 +572,7 @@ def convert(shard_no, doc): results.append(shard_result) result = [] for parts in zip(*results): - merged = heapq.nlargest(self.num_best, itertools.chain(*parts), key=lambda item: item[1]) + merged = _nlargest(self.num_best, parts) result.append(merged) if pool: # gc doesn't seem to collect the Pools, eventually leading to diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index fbfd14880c..d25fc14da7 100644 --- a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -30,6 +30,7 @@ from gensim.similarities import UniformTermSimilarityIndex from gensim.similarities import SparseTermSimilarityMatrix from gensim.similarities import LevenshteinSimilarityIndex +from gensim.similarities.docsim import _nlargest from gensim.similarities.levenshtein import levdist, levsim try: @@ -532,6 +533,11 @@ def testChunksize(self): self.assertTrue(numpy.allclose(expected, sims)) index.destroy() + def testNlargest(self): + sims = ([(0, 0.8), (1, 0.2), (2, 0.0), (3, 0.0), (4, -0.1), (5, -0.15)],) + expected = [(0, 0.8), (1, 0.2), (5, -0.15)] + self.assertEqual(_nlargest(3, sims), expected) + class TestWord2VecAnnoyIndexer(unittest.TestCase):