From d141ddae22262541e397488ddeaa2de4a831e055 Mon Sep 17 00:00:00 2001
From: Evgeny Rubanenko
Date: Sat, 4 Jan 2020 00:15:50 +0300
Subject: [PATCH 1/2] Find largest by absolute value

---
 gensim/similarities/docsim.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py
index 692d76e18a..b7a244e630 100755
--- a/gensim/similarities/docsim.py
+++ b/gensim/similarities/docsim.py
@@ -539,7 +539,7 @@ def convert(shard_no, doc):
         if not is_corpus:
             # user asked for num_best most similar and query is a single doc
             results = (convert(shard_no, result) for shard_no, result in enumerate(shard_results))
-            result = heapq.nlargest(self.num_best, itertools.chain(*results), key=lambda item: item[1])
+            result = heapq.nlargest(self.num_best, itertools.chain(*results), key=lambda item: abs(item[1]))
         else:
             # the trickiest combination: returning num_best results when query was a corpus
             results = []
@@ -548,7 +548,7 @@ def convert(shard_no, doc):
                 results.append(shard_result)
             result = []
             for parts in zip(*results):
-                merged = heapq.nlargest(self.num_best, itertools.chain(*parts), key=lambda item: item[1])
+                merged = heapq.nlargest(self.num_best, itertools.chain(*parts), key=lambda item: abs(item[1]))
                 result.append(merged)
         if pool:
             # gc doesn't seem to collect the Pools, eventually leading to

From 70bd975a2250bf3ef7312fd7ae2a0df2691e8c87 Mon Sep 17 00:00:00 2001
From: Evgeny Rubanenko
Date: Sun, 5 Jan 2020 01:24:46 +0300
Subject: [PATCH 2/2] Add helper function to simplify code & add unit test for it

---
 gensim/similarities/docsim.py    | 28 ++++++++++++++++++++++++++--
 gensim/test/test_similarities.py |  6 ++++++
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py
index b7a244e630..256f276394 100755
--- a/gensim/similarities/docsim.py
+++ b/gensim/similarities/docsim.py
@@ -233,6 +233,30 @@ def query_shard(args):
     return result
 
 
+def _nlargest(n, iterable):
+    """Helper for extracting the n documents with maximum similarity.
+
+    Parameters
+    ----------
+    n : int
+        Number of elements to be extracted.
+    iterable : iterable of list of (int, float)
+        Iterable containing documents with computed similarities.
+
+    Returns
+    -------
+    :class:`list`
+        List with the n largest elements from the dataset defined by iterable.
+
+    Notes
+    -----
+    Elements are compared by the absolute value of similarity, because a negative
+    similarity does not imply dissimilarity.
+
+    """
+    return heapq.nlargest(n, itertools.chain(*iterable), key=lambda item: abs(item[1]))
+
+
 class Similarity(interfaces.SimilarityABC):
     """Compute cosine similarity of a dynamic query against a corpus of documents ('the index').
@@ -539,7 +563,7 @@ def convert(shard_no, doc):
         if not is_corpus:
             # user asked for num_best most similar and query is a single doc
             results = (convert(shard_no, result) for shard_no, result in enumerate(shard_results))
-            result = heapq.nlargest(self.num_best, itertools.chain(*results), key=lambda item: abs(item[1]))
+            result = _nlargest(self.num_best, results)
         else:
             # the trickiest combination: returning num_best results when query was a corpus
             results = []
@@ -548,7 +572,7 @@ def convert(shard_no, doc):
                 results.append(shard_result)
             result = []
             for parts in zip(*results):
-                merged = heapq.nlargest(self.num_best, itertools.chain(*parts), key=lambda item: abs(item[1]))
+                merged = _nlargest(self.num_best, parts)
                 result.append(merged)
         if pool:
             # gc doesn't seem to collect the Pools, eventually leading to

diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py
index fbfd14880c..d25fc14da7 100644
--- a/gensim/test/test_similarities.py
+++ b/gensim/test/test_similarities.py
@@ -30,6 +30,7 @@
 from gensim.similarities import UniformTermSimilarityIndex
 from gensim.similarities import SparseTermSimilarityMatrix
 from gensim.similarities import LevenshteinSimilarityIndex
+from gensim.similarities.docsim import _nlargest
 from gensim.similarities.levenshtein import levdist, levsim
 
 try:
@@ -532,6 +533,11 @@ def testChunksize(self):
         self.assertTrue(numpy.allclose(expected, sims))
         index.destroy()
 
+    def testNlargest(self):
+        sims = ([(0, 0.8), (1, 0.2), (2, 0.0), (3, 0.0), (4, -0.1), (5, -0.15)],)
+        expected = [(0, 0.8), (1, 0.2), (5, -0.15)]
+        self.assertEqual(expected, _nlargest(3, sims))
+
 
 class TestWord2VecAnnoyIndexer(unittest.TestCase):
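For reviewers who want to sanity-check the new ranking behavior outside gensim, here is a minimal, self-contained sketch. It recreates the _nlargest helper from the patch verbatim and feeds it two hypothetical per-shard result lists; the shard split is illustrative (adapted from the unit test data), not the shape any particular index actually produced.

    import heapq
    import itertools

    def _nlargest(n, iterable):
        # Same helper as in the patch: rank (doc_id, similarity) pairs by the
        # absolute value of the similarity, so strongly negative scores still
        # surface, since a negative similarity does not imply dissimilarity.
        return heapq.nlargest(n, itertools.chain(*iterable), key=lambda item: abs(item[1]))

    # Two hypothetical per-shard result lists (the iterable-of-lists shape
    # that Similarity passes in when merging shard results).
    shard_results = (
        [(0, 0.8), (1, 0.2), (2, 0.0)],
        [(3, 0.0), (4, -0.1), (5, -0.15)],
    )
    print(_nlargest(3, shard_results))
    # prints: [(0, 0.8), (1, 0.2), (5, -0.15)]

Note that document 5 beats documents 2-4 despite its negative score, because abs(-0.15) ranks above their absolute values; with the pre-patch key (item[1] without abs) the result would have been [(0, 0.8), (1, 0.2), (2, 0.0)]. Keying on abs() changes only the comparison, so the heap-based merge across shards is otherwise unchanged.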