diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 0375eded69..7b137284cd 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -971,7 +971,7 @@ def get_term_topics(self, word_id, minimum_probability=None): return values - def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10, normed=True): + def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10, diagonal=False, annotation=True, normed=True): """ Calculate difference topic2topic between two Lda models `other` instances of `LdaMulticore` or `LdaModel` @@ -979,8 +979,10 @@ def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10 Available values: `kullback_leibler`, `hellinger` and `jaccard` `num_words` is quantity of most relevant words that used if distance == `jaccard` (also used for annotation) `n_ann_terms` is max quantity of words in intersection/symmetric difference between topics (used for annotation) + `diagonal` set to True if the difference is required only between the identical topic no.s (returns diagonal of diff matrix) + `annotation` whether the intersection or difference of words between two topics should be returned Returns a matrix Z with shape (m1.num_topics, m2.num_topics), where Z[i][j] - difference between topic_i and topic_j - and matrix annotation with shape (m1.num_topics, m2.num_topics, 2, None), + and matrix annotation (if True) with shape (m1.num_topics, m2.num_topics, 2, None), where: annotation[i][j] = [[`int_1`, `int_2`, ...], [`diff_1`, `diff_2`, ...]] and @@ -1013,6 +1015,7 @@ def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10 distance_func = distances[distance] d1, d2 = self.state.get_lambda(), other.state.get_lambda() t1_size, t2_size = d1.shape[0], d2.shape[0] + annotation_terms = None fst_topics = [{w for (w, _) in self.show_topic(topic, topn=num_words)} for topic in xrange(t1_size)] snd_topics = [{w for (w, _) in other.show_topic(topic, topn=num_words)} for topic in xrange(t2_size)] @@ -1020,28 +1023,41 @@ def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10 if distance == "jaccard": d1, d2 = fst_topics, snd_topics - z = np.zeros((t1_size, t2_size)) - for topic1 in range(t1_size): - for topic2 in range(t2_size): - z[topic1][topic2] = distance_func(d1[topic1], d2[topic2]) - - if normed: - if np.abs(np.max(z)) > 1e-8: - z /= np.max(z) - - annotation = [[None] * t1_size for _ in range(t2_size)] + if diagonal: + assert t1_size == t2_size, "Both input models should have same no. of topics, as the diagonal will only be valid in a square matrix" + # initialize z and annotation array + z = np.zeros(t1_size) + if annotation: + annotation_terms = np.zeros(t1_size, dtype=list) + else: + # initialize z and annotation matrix + z = np.zeros((t1_size, t2_size)) + if annotation: + annotation_terms = np.zeros((t1_size, t2_size), dtype=list) + + # iterate over each cell in the initialized z and annotation + for topic in np.ndindex(z.shape): + topic1 = topic[0] + if diagonal: + topic2 = topic1 + else: + topic2 = topic[1] - for topic1 in range(t1_size): - for topic2 in range(t2_size): + z[topic] = distance_func(d1[topic1], d2[topic2]) + if annotation: pos_tokens = fst_topics[topic1] & snd_topics[topic2] neg_tokens = fst_topics[topic1].symmetric_difference(snd_topics[topic2]) pos_tokens = sample(pos_tokens, min(len(pos_tokens), n_ann_terms)) neg_tokens = sample(neg_tokens, min(len(neg_tokens), n_ann_terms)) - annotation[topic1][topic2] = [pos_tokens, neg_tokens] + annotation_terms[topic] = [pos_tokens, neg_tokens] + + if normed: + if np.abs(np.max(z)) > 1e-8: + z /= np.max(z) - return z, annotation + return z, annotation_terms def __getitem__(self, bow, eps=None): """ diff --git a/gensim/test/test_tmdiff.py b/gensim/test/test_tmdiff.py index 5ab0c0fac7..d6b60e8721 100644 --- a/gensim/test/test_tmdiff.py +++ b/gensim/test/test_tmdiff.py @@ -4,6 +4,7 @@ # Copyright (C) 2016 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +import logging import unittest import numpy as np @@ -31,14 +32,22 @@ def setUp(self): self.model = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=self.num_topics, passes=10) def testBasic(self): + # test for matrix case mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms) self.assertEqual(mdiff.shape, (self.num_topics, self.num_topics)) self.assertEquals(len(annotation), self.num_topics) self.assertEquals(len(annotation[0]), self.num_topics) + # test for diagonal case + mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, diagonal=True) + + self.assertEqual(mdiff.shape, (self.num_topics,)) + self.assertEquals(len(annotation), self.num_topics) + def testIdentity(self): for dist_name in ["hellinger", "kullback_leibler", "jaccard"]: + # test for matrix case mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name) for row in annotation: @@ -51,6 +60,23 @@ def testIdentity(self): if dist_name == "jaccard": self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype))) + # test for diagonal case + mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name, diagonal=True) + + for (int_tokens, diff_tokens) in annotation: + self.assertEquals(diff_tokens, []) + self.assertEquals(len(int_tokens), self.n_ann_terms) + + self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype))) + + if dist_name == "jaccard": + self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype))) + def testInput(self): self.assertRaises(ValueError, self.model.diff, self.model, n_ann_terms=self.n_ann_terms, distance='something') self.assertRaises(ValueError, self.model.diff, [], n_ann_terms=self.n_ann_terms, distance='something') + + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) + unittest.main()