diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 592cc9537c..1b4aea863b 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -28,8 +28,7 @@ def precompute_idfs(wglobal, dfs, total_docs): """Precompute the inverse document frequency mapping for all terms.""" # not strictly necessary and could be computed on the fly in TfidfModel__getitem__. # this method is here just to speed things up a little. - return dict((termid, wglobal(df, total_docs)) - for termid, df in iteritems(dfs)) + return dict((termid, wglobal(df, total_docs)) for termid, df in iteritems(dfs)) class TfidfModel(interfaces.TransformationABC): @@ -49,8 +48,9 @@ class TfidfModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. """ - def __init__(self, corpus=None, id2word=None, dictionary=None, - wlocal=utils.identity, wglobal=df2idf, normalize=True): + def __init__( + self, corpus=None, id2word=None, dictionary=None, + wlocal=utils.identity, wglobal=df2idf, normalize=True): """ Compute tf-idf by multiplying a local component (term frequency) with a global component (inverse document frequency), and normalizing @@ -87,11 +87,13 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, # statistics we need to construct the IDF mapping. we can skip the # step that goes through the corpus (= an optimization). if corpus is not None: - logger.warning("constructor received both corpus and explicit " - "inverse document frequencies; ignoring the corpus") + logger.warning( + "constructor received both corpus and explicit inverse document frequencies; ignoring the corpus") self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz self.dfs = dictionary.dfs.copy() self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) + if id2word is None: + self.id2word = dictionary elif corpus is not None: self.initialize(corpus) else: @@ -114,7 +116,7 @@ def initialize(self, corpus): numnnz, docno = 0, -1 for docno, bow in enumerate(corpus): if docno % 10000 == 0: - logger.info("PROGRESS: processing document #%i" % docno) + logger.info("PROGRESS: processing document #%i", docno) numnnz += len(bow) for termid, _ in bow: dfs[termid] = dfs.get(termid, 0) + 1 @@ -126,8 +128,9 @@ def initialize(self, corpus): # and finally compute the idf weights n_features = max(dfs) if dfs else 0 - logger.info("calculating IDF weights for %i documents and %i features (%i matrix non-zeros)" % - (self.num_docs, n_features, self.num_nnz)) + logger.info( + "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)", + self.num_docs, n_features, self.num_nnz) self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) @@ -142,8 +145,10 @@ def __getitem__(self, bow, eps=1e-12): # unknown (new) terms will be given zero weight (NOT infinity/huge weight, # as strict application of the IDF formula would dictate) - vector = [(termid, self.wlocal(tf) * self.idfs.get(termid)) - for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0] + vector = [ + (termid, self.wlocal(tf) * self.idfs.get(termid)) + for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0 + ] # and finally, normalize the vector either to unit length, or use a # user-defined normalization function