Skip to content

Commit

Permalink
Reduce memory consumption of summarizer (#2298)
Browse files Browse the repository at this point in the history
* reduce memory consumption

* fix build

* add deleting null-weighted edge

* iterate over bm25 weights

* fix build

* fix wrong example

* add method iter_graph in Graph

* add index argument in SyntacticUnit

* fix logging messages

* refactor graph - remove unnecessary parts

* Add test and fix typos

* fix built (flake8)

* add printing of documents number
  • Loading branch information
horpto authored and menshikh-iv committed Jan 18, 2019
1 parent 239856c commit fabeffe
Show file tree
Hide file tree
Showing 8 changed files with 280 additions and 201 deletions.
89 changes: 88 additions & 1 deletion gensim/summarization/bm25.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,48 @@ def get_scores(self, document):
scores = [self.get_score(document, index) for index in range(self.corpus_size)]
return scores

def get_scores_bow(self, document):
"""Computes and returns BM25 scores of given `document` in relation to
every item in corpus.
Parameters
----------
document : list of str
Document to be scored.
Returns
-------
list of float
BM25 scores.
"""
scores = []
for index in range(self.corpus_size):
score = self.get_score(document, index)
if score > 0:
scores.append((index, score))
return scores


def _get_scores_bow(bm25, document):
"""Helper function for retrieving bm25 scores of given `document` in parallel
in relation to every item in corpus.
Parameters
----------
bm25 : BM25 object
BM25 object fitted on the corpus where documents are retrieved.
document : list of str
Document to be scored.
Returns
-------
list of (index, float)
BM25 scores in a bag of weights format.
"""
return bm25.get_scores_bow(document)


def _get_scores(bm25, document):
"""Helper function for retrieving bm25 scores of given `document` in parallel
Expand All @@ -183,6 +225,52 @@ def _get_scores(bm25, document):
return bm25.get_scores(document)


def iter_bm25_bow(corpus, n_jobs=1):
"""Yield BM25 scores (weights) of documents in corpus.
Each document has to be weighted with every document in given corpus.
Parameters
----------
corpus : list of list of str
Corpus of documents.
n_jobs : int
The number of processes to use for computing bm25.
Yields
-------
list of (index, float)
BM25 scores in bag of weights format.
Examples
--------
.. sourcecode:: pycon
>>> from gensim.summarization.bm25 import iter_bm25_weights
>>> corpus = [
... ["black", "cat", "white", "cat"],
... ["cat", "outer", "space"],
... ["wag", "dog"]
... ]
>>> result = iter_bm25_weights(corpus, n_jobs=-1)
"""
bm25 = BM25(corpus)

n_processes = effective_n_jobs(n_jobs)
if n_processes == 1:
for doc in corpus:
yield bm25.get_scores_bow(doc)
return

get_score = partial(_get_scores_bow, bm25)
pool = Pool(n_processes)

for bow in pool.imap(get_score, corpus):
yield bow
pool.close()
pool.join()


def get_bm25_weights(corpus, n_jobs=1):
"""Returns BM25 scores (weights) of documents in corpus.
Each document has to be weighted with every document in given corpus.
Expand Down Expand Up @@ -224,5 +312,4 @@ def get_bm25_weights(corpus, n_jobs=1):
weights = pool.map(get_score, corpus)
pool.close()
pool.join()

return weights
Loading

0 comments on commit fabeffe

Please sign in to comment.