Reduce memory consumption of summarizer (#2298)

* reduce memory consumption * fix build * add deleting null-weighted edge * iterate over bm25 weights * fix build * fix wrong example * add method iter_graph in Graph * add index argument in SyntacticUnit * fix logging messages * refactor graph - remove unnecessary parts * Add test and fix typos * fix built (flake8) * add printing of documents number
piskvorky · Jan 18, 2019 · fabeffe · fabeffe
1 parent 239856c
commit fabeffe
Show file tree

Hide file tree

Showing 8 changed files with 280 additions and 201 deletions.
diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py
@@ -162,6 +162,48 @@ def get_scores(self, document):
         scores = [self.get_score(document, index) for index in range(self.corpus_size)]
         return scores
 
+    def get_scores_bow(self, document):
+        """Computes and returns BM25 scores of given `document` in relation to
+        every item in corpus.
+
+        Parameters
+        ----------
+        document : list of str
+            Document to be scored.
+
+        Returns
+        -------
+        list of float
+            BM25 scores.
+
+        """
+        scores = []
+        for index in range(self.corpus_size):
+            score = self.get_score(document, index)
+            if score > 0:
+                scores.append((index, score))
+        return scores
+
+
+def _get_scores_bow(bm25, document):
+    """Helper function for retrieving bm25 scores of given `document` in parallel
+    in relation to every item in corpus.
+
+    Parameters
+    ----------
+    bm25 : BM25 object
+        BM25 object fitted on the corpus where documents are retrieved.
+    document : list of str
+        Document to be scored.
+
+    Returns
+    -------
+    list of (index, float)
+        BM25 scores in a bag of weights format.
+
+    """
+    return bm25.get_scores_bow(document)
+
 
 def _get_scores(bm25, document):
     """Helper function for retrieving bm25 scores of given `document` in parallel
@@ -183,6 +225,52 @@ def _get_scores(bm25, document):
     return bm25.get_scores(document)
 
 
+def iter_bm25_bow(corpus, n_jobs=1):
+    """Yield BM25 scores (weights) of documents in corpus.
+    Each document has to be weighted with every document in given corpus.
+
+    Parameters
+    ----------
+    corpus : list of list of str
+        Corpus of documents.
+    n_jobs : int
+        The number of processes to use for computing bm25.
+
+    Yields
+    -------
+    list of (index, float)
+        BM25 scores in bag of weights format.
+
+    Examples
+    --------
+    .. sourcecode:: pycon
+
+        >>> from gensim.summarization.bm25 import iter_bm25_weights
+        >>> corpus = [
+        ...     ["black", "cat", "white", "cat"],
+        ...     ["cat", "outer", "space"],
+        ...     ["wag", "dog"]
+        ... ]
+        >>> result = iter_bm25_weights(corpus, n_jobs=-1)
+
+    """
+    bm25 = BM25(corpus)
+
+    n_processes = effective_n_jobs(n_jobs)
+    if n_processes == 1:
+        for doc in corpus:
+            yield bm25.get_scores_bow(doc)
+        return
+
+    get_score = partial(_get_scores_bow, bm25)
+    pool = Pool(n_processes)
+
+    for bow in pool.imap(get_score, corpus):
+        yield bow
+    pool.close()
+    pool.join()
+
+
 def get_bm25_weights(corpus, n_jobs=1):
     """Returns BM25 scores (weights) of documents in corpus.
     Each document has to be weighted with every document in given corpus.
@@ -224,5 +312,4 @@ def get_bm25_weights(corpus, n_jobs=1):
     weights = pool.map(get_score, corpus)
     pool.close()
     pool.join()
-
     return weights