diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index f5032f3c01..0d0aa2fe37 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -26,8 +26,6 @@ from gensim import interfaces from gensim.matutils import argsort -from gensim.models.ldamodel import LdaModel -from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet from gensim.topic_coherence import (segmentation, probability_estimation, direct_confirmation_measure, indirect_confirmation_measure, aggregation) @@ -279,23 +277,15 @@ def _topics_differ(self, new_topics): def _get_topics(self): """Internal helper function to return topics from a trained topic model.""" - topics = [] - if isinstance(self.model, LdaModel): - for topic in self.model.state.get_lambda(): - bestn = argsort(topic, topn=self.topn, reverse=True) - topics.append(bestn) - elif isinstance(self.model, LdaVowpalWabbit): - for topic in self.model._get_topics(): - bestn = argsort(topic, topn=self.topn, reverse=True) - topics.append(bestn) - elif isinstance(self.model, LdaMallet): - for topic in self.model.word_topics: - bestn = argsort(topic, topn=self.topn, reverse=True) - topics.append(bestn) - else: - raise ValueError("This topic model is not currently supported. Supported topic models " - " are LdaModel, LdaVowpalWabbit and LdaMallet.") - return topics + try: + return [ + argsort(topic, topn=self.topn, reverse=True) for topic in + self.model.get_topics() + ] + except AttributeError: + raise ValueError( + "This topic model is not currently supported. Supported topic models" + " should implement the `get_topics` method.") def segment_topics(self): return self.measure.seg(self.topics) diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index 6937d928d4..0506fd13df 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -36,13 +36,14 @@ import logging import time import warnings + import numpy as np from scipy.special import gammaln, psi # gamma function utils +from six.moves import xrange from gensim import interfaces, utils, matutils from gensim.matutils import dirichlet_expectation from gensim.models import basemodel, ldamodel -from six.moves import xrange logger = logging.getLogger(__name__) @@ -456,6 +457,13 @@ def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=No hdp_formatter = HdpTopicFormatter(self.id2word, betas) return hdp_formatter.show_topic(topic_id, topn, log, formatted) + def get_topics(self): + """ + Return the term topic matrix learned during inference. + This is a `num_topics` x `vocabulary_size` np.ndarray of floats. + """ + return self.m_lambda + self.m_eta + def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True): """ Print the `num_words` most probable words for `num_topics` number of topics. diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 7b137284cd..9828e4d091 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -31,21 +31,21 @@ import logging -import numpy as np import numbers -from random import sample import os - -from gensim import interfaces, utils, matutils -from gensim.matutils import dirichlet_expectation -from gensim.models import basemodel -from gensim.matutils import kullback_leibler, hellinger, jaccard_distance - from itertools import chain +from random import sample + +import numpy as np +import six from scipy.special import gammaln, psi # gamma function utils from scipy.special import polygamma from six.moves import xrange -import six + +from gensim import interfaces, utils, matutils +from gensim.matutils import dirichlet_expectation +from gensim.matutils import kullback_leibler, hellinger, jaccard_distance +from gensim.models import basemodel # log(sum(exp(x))) that tries to avoid overflow try: @@ -815,6 +815,13 @@ def show_topic(self, topicid, topn=10): """ return [(self.id2word[id], value) for id, value in self.get_topic_terms(topicid, topn)] + def get_topics(self): + """ + Return the term topic matrix learned during inference. + This is a `num_topics` x `vocabulary_size` np.ndarray of floats. + """ + return self.state.get_lambda() + def get_topic_terms(self, topicid, topn=10): """ Return a list of `(word_id, probability)` 2-tuples for the most @@ -823,7 +830,7 @@ def get_topic_terms(self, topicid, topn=10): Only return 2-tuples for the topn most probable words (ignore the rest). """ - topic = self.state.get_lambda()[topicid] + topic = self.get_topics()[topicid] topic = topic / topic.sum() # normalize to probability distribution bestn = matutils.argsort(topic, topn, reverse=True) return [(id, topic[id]) for id in bestn] @@ -840,7 +847,7 @@ def top_topics(self, corpus, num_words=20): topics = [] str_topics = [] - for topic in self.state.get_lambda(): + for topic in self.get_topics(): topic = topic / topic.sum() # normalize to probability distribution bestn = matutils.argsort(topic, topn=num_words, reverse=True) topics.append(bestn) @@ -1013,7 +1020,7 @@ def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10 raise ValueError("The parameter `other` must be of type `{}`".format(self.__name__)) distance_func = distances[distance] - d1, d2 = self.state.get_lambda(), other.state.get_lambda() + d1, d2 = self.get_topics(), other.get_topics() t1_size, t2_size = d1.shape[0], d2.shape[0] annotation_terms = None diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index bc0cc4fb0f..8f83befdd7 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -57,13 +57,11 @@ import scipy.linalg import scipy.sparse from scipy.sparse import sparsetools - -from gensim import interfaces, matutils, utils -from gensim.models import basemodel - from six import iterkeys from six.moves import xrange +from gensim import interfaces, matutils, utils +from gensim.models import basemodel logger = logging.getLogger(__name__) @@ -470,6 +468,20 @@ def __getitem__(self, bow, scaled=False, chunksize=512): result = matutils.Dense2Corpus(topic_dist) return result + def get_topics(self): + """ + Return the term topic matrix learned during inference. + This is a `num_topics` x `vocabulary_size` np.ndarray of floats. + + NOTE: The number of topics can actually be smaller than `self.num_topics`, + if there were not enough factors (real rank of input matrix smaller than + `self.num_topics`). + """ + projections = self.projection.u.T + num_topics = len(projections) + topics = [np.asarray(projections[i, :]).flatten() for i in range(num_topics)] + return np.array(topics) + def show_topic(self, topicno, topn=10): """ Return a specified topic (=left singular vector), 0 <= `topicno` < `self.num_topics`, diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index 5276b035f1..9ece62b03c 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -30,22 +30,19 @@ import logging +import os import random import tempfile -import os - -import numpy - import xml.etree.ElementTree as et import zipfile -from six import iteritems +import numpy from smart_open import smart_open from gensim import utils, matutils -from gensim.utils import check_output, revdict -from gensim.models.ldamodel import LdaModel from gensim.models import basemodel +from gensim.models.ldamodel import LdaModel +from gensim.utils import check_output, revdict logger = logging.getLogger(__name__) @@ -213,6 +210,13 @@ def load_document_topics(self): """ return self.read_doctopics(self.fdoctopics()) + def get_topics(self): + """ + Return the term topic matrix learned during inference. + This is a `num_topics` x `vocabulary_size` np.ndarray of floats. + """ + return self.word_topics + def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): """ Print the `num_words` most probable words for `num_topics` number of topics. diff --git a/gensim/models/wrappers/ldavowpalwabbit.py b/gensim/models/wrappers/ldavowpalwabbit.py index 6d6ae9e275..978f541faa 100644 --- a/gensim/models/wrappers/ldavowpalwabbit.py +++ b/gensim/models/wrappers/ldavowpalwabbit.py @@ -53,15 +53,15 @@ .. [2] http://www.cs.princeton.edu/~mdhoffma/ """ -from __future__ import unicode_literals -from __future__ import print_function from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals -import os import logging -import tempfile +import os import shutil import subprocess +import tempfile import numpy @@ -235,6 +235,13 @@ def log_perplexity(self, chunk): corpus_words) return bound + def get_topics(self): + """ + Return the term topic matrix learned during inference. + This is a `num_topics` x `vocabulary_size` np.ndarray of floats. + """ + return self._get_topics() + def print_topics(self, num_topics=10, num_words=10): return self.show_topics(num_topics, num_words, log=True) diff --git a/gensim/test/basetests.py b/gensim/test/basetests.py index 4032f6ff21..a370feb06c 100644 --- a/gensim/test/basetests.py +++ b/gensim/test/basetests.py @@ -8,8 +8,9 @@ Automated tests for checking transformation algorithms (the models package). """ -import six import numpy as np +import six + class TestBaseTopicModel(object): def testPrintTopic(self): @@ -41,3 +42,12 @@ def testShowTopics(self): for k, v in topic: self.assertTrue(isinstance(k, six.string_types)) self.assertTrue(isinstance(v, (np.floating, float))) + + def testGetTopics(self): + topics = self.model.get_topics() + vocab_size = len(self.model.id2word) + for topic in topics: + self.assertTrue(isinstance(topic, np.ndarray)) + self.assertEqual(topic.dtype, np.float64) + self.assertEqual(vocab_size, topic.shape[0]) +