diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py
index b9621aeb1e..5687d0d2a1 100644
--- a/gensim/summarization/bm25.py
+++ b/gensim/summarization/bm25.py
@@ -37,6 +37,7 @@
 """
 
 
+import logging
 import math
 from six import iteritems
 from six.moves import range
@@ -48,6 +49,8 @@
 PARAM_B = 0.75
 EPSILON = 0.25
 
+logger = logging.getLogger(__name__)
+
 
 class BM25(object):
     """Implementation of Best Matching 25 ranking function.
@@ -116,6 +119,14 @@ def _initialize(self, corpus):
                 negative_idfs.append(word)
         self.average_idf = float(idf_sum) / len(self.idf)
 
+        if self.average_idf < 0:
+            # eps below scales with average_idf, so a negative average yields a negative IDF floor.
+            logger.warning(
+                'Average inverse document frequency is less than zero. Your corpus of %s documents'
+                ' is either too small or it does not originate from actual text documents. BM25'
+                ' will likely produce "wrong" results.', self.corpus_size
+            )
+
         eps = EPSILON * self.average_idf
         for word in negative_idfs:
             self.idf[word] = eps