diff --git a/docs/notebooks/dtm_example.ipynb b/docs/notebooks/dtm_example.ipynb index 5cb5cad31d..03ecc0a53f 100644 --- a/docs/notebooks/dtm_example.ipynb +++ b/docs/notebooks/dtm_example.ipynb @@ -41,15 +41,7 @@ "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "DEBUG:root:test\n" - ] - } - ], + "outputs": [], "source": [ "logger = logging.getLogger()\n", "logger.setLevel(logging.DEBUG)\n", @@ -110,16 +102,7 @@ "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])\n", - "INFO:gensim.corpora.dictionary:built Dictionary(546 unique tokens: [u'semantic', u'global', u'dynamic', u'focus', u'executing']...) from 10 documents (total 1112 corpus positions)\n" - ] - } - ], + "outputs": [], "source": [ "class DTMcorpus(corpora.textcorpus.TextCorpus):\n", "\n", @@ -171,20 +154,7 @@ "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:gensim.models.wrappers.dtmmodel:serializing temporary corpus to /tmp/fc5bec_train-mult.dat\n", - "INFO:gensim.corpora.bleicorpus:no word id mapping provided; initializing from corpus\n", - "INFO:gensim.corpora.bleicorpus:storing corpus in Blei's LDA-C format into /tmp/fc5bec_train-mult.dat\n", - "INFO:gensim.corpora.bleicorpus:saving vocabulary of 546 words to /tmp/fc5bec_train-mult.dat.vocab\n", - "INFO:gensim.models.wrappers.dtmmodel:training DTM with args --ntopics=2 --model=dtm --mode=fit --initialize_lda=true --corpus_prefix=/tmp/fc5bec_train --outname=/tmp/fc5bec_train_out --alpha=0.01 --lda_max_em_iter=10 --lda_sequence_min_iter=6 --lda_sequence_max_iter=20 --top_chain_var=0.005 --rng_seed=0 \n", - "INFO:gensim.models.wrappers.dtmmodel:Running command ['/home/bhargav/dtm/main', '--ntopics=2', '--model=dtm', '--mode=fit', '--initialize_lda=true', '--corpus_prefix=/tmp/fc5bec_train', '--outname=/tmp/fc5bec_train_out', '--alpha=0.01', '--lda_max_em_iter=10', '--lda_sequence_min_iter=6', '--lda_sequence_max_iter=20', '--top_chain_var=0.005', '--rng_seed=0']\n" - ] - } - ], + "outputs": [], "source": [ "model = DtmModel(dtm_path, corpus, time_seq, num_topics=2,\n", " id2word=corpus.dictionary, initialize_lda=True)" @@ -205,7 +175,7 @@ }, "outputs": [], "source": [ - "topics = model.show_topic(topicid=1, time=1, topn=10)" + "topics = model.show_topic(topicid=1, time=1, num_words=10)" ] }, { @@ -218,16 +188,16 @@ { "data": { "text/plain": [ - "[(0.036994582002755308, u'skills'),\n", - " (0.035650395084747225, u'engineering'),\n", - " (0.020960209762361768, u'knowledge'),\n", - " (0.017327107555455742, u'electrical'),\n", - " (0.017047119686861439, u'technical'),\n", - " (0.015645884634432328, u'teams'),\n", - " (0.013631466899125954, u'testing'),\n", - " (0.012571386996720339, u'complex'),\n", - " (0.011803114081816798, u'management'),\n", - " (0.010380921159700848, u'customer')]" + "[(0.023565028919164586, 'skills'),\n", + " (0.02308969736545094, 'engineering'),\n", + " (0.019616329462533579, 'idexx'),\n", + " (0.0194313503731963, 'testing'),\n", + " (0.01858957362093603, 'technical'),\n", + " (0.017685337300946517, 'electrical'),\n", + " (0.017483543705882995, 'management'),\n", + " (0.015310984365058886, 'complex'),\n", + " (0.014032951915032212, 'knowledge'),\n", + " (0.012958700085355939, 'technology')]" ] }, "execution_count": 9, @@ -266,8 +236,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Distribution of Topic 0 0.000060\n", - "Distribution of Topic 1 0.999940\n" + "Distribution of Topic 0 0.562498\n", + "Distribution of Topic 1 0.437502\n" ] } ], @@ -276,7 +246,7 @@ "num_topics = 2\n", "\n", "for i in range(0, num_topics):\n", - " print \"Distribution of Topic %d %f\" % (i, model.gamma_[doc_number, i])" + " print (\"Distribution of Topic %d %f\" % (i, model.gamma_[doc_number, i]))" ] }, { @@ -305,20 +275,7 @@ "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:gensim.models.wrappers.dtmmodel:serializing temporary corpus to /tmp/d3d211_train-mult.dat\n", - "INFO:gensim.corpora.bleicorpus:no word id mapping provided; initializing from corpus\n", - "INFO:gensim.corpora.bleicorpus:storing corpus in Blei's LDA-C format into /tmp/d3d211_train-mult.dat\n", - "INFO:gensim.corpora.bleicorpus:saving vocabulary of 546 words to /tmp/d3d211_train-mult.dat.vocab\n", - "INFO:gensim.models.wrappers.dtmmodel:training DTM with args --ntopics=2 --model=fixed --mode=fit --initialize_lda=true --corpus_prefix=/tmp/d3d211_train --outname=/tmp/d3d211_train_out --alpha=0.01 --lda_max_em_iter=10 --lda_sequence_min_iter=6 --lda_sequence_max_iter=20 --top_chain_var=0.005 --rng_seed=0 \n", - "INFO:gensim.models.wrappers.dtmmodel:Running command ['/home/bhargav/dtm/main', '--ntopics=2', '--model=fixed', '--mode=fit', '--initialize_lda=true', '--corpus_prefix=/tmp/d3d211_train', '--outname=/tmp/d3d211_train_out', '--alpha=0.01', '--lda_max_em_iter=10', '--lda_sequence_min_iter=6', '--lda_sequence_max_iter=20', '--top_chain_var=0.005', '--rng_seed=0']\n" - ] - } - ], + "outputs": [], "source": [ "model = DtmModel(dtm_path, corpus, time_seq, num_topics=2,\n", " id2word=corpus.dictionary, initialize_lda=True, model='fixed')" @@ -341,11 +298,14 @@ }, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.00438000069435\n" - ] + "data": { + "text/plain": [ + "0.0061833357763878861" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -353,7 +313,7 @@ "topic_no = 1 #topic number 2\n", "time_slice = 0 #time slice 1\n", "\n", - "print model.influences_time[time_slice][document_no][topic_no]" + "model.influences_time[time_slice][document_no][topic_no]" ] }, { @@ -383,21 +343,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.10" + "pygments_lexer": "ipython3", + "version": "3.5.1" } }, "nbformat": 4, diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py index 35fe3f4df7..0d029211e5 100644 --- a/gensim/models/wrappers/dtmmodel.py +++ b/gensim/models/wrappers/dtmmodel.py @@ -173,9 +173,9 @@ def convert_input(self, corpus, time_slices): corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus) with utils.smart_open(self.ftimeslices(), 'wb') as fout: - fout.write(six.u(utils.to_utf8(str(len(self.time_slices)) + "\n"))) + fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n")) for sl in time_slices: - fout.write(six.u(utils.to_utf8(str(sl) + "\n"))) + fout.write(utils.to_utf8(str(sl) + "\n")) def train(self, corpus, time_slices, mode, model): """ @@ -271,18 +271,18 @@ def show_topics(self, num_topics=10, times=5, num_words=10, log=False, formatted for time in chosen_times: for i in chosen_topics: if formatted: - topic = self.print_topic(i, time, topn=topn) + topic = self.print_topic(i, time, num_words=num_words) else: - topic = self.show_topic(i, time, topn=topn) + topic = self.show_topic(i, time, num_words=num_words) shown.append(topic) # if log: # logger.info("topic #%i (%.3f): %s" % (i, self.alpha[i], # topic)) return shown - def show_topic(self, topicid, time, topn=50): + def show_topic(self, topicid, time, num_words=50): """ - Return `topn` most probable words for the given `topicid`, as a list of + Return `num_words` most probable words for the given `topicid`, as a list of `(word_probability, word)` 2-tuples. """ @@ -293,10 +293,10 @@ def show_topic(self, topicid, time, topn=50): # normalize to probability dist topic = topic / topic.sum() # sort according to prob - bestn = matutils.argsort(topic, topn, reverse=True) + bestn = matutils.argsort(topic, num_words, reverse=True) beststr = [(topic[id], self.id2word[id]) for id in bestn] return beststr - def print_topic(self, topicid, time, topn=10): + def print_topic(self, topicid, time, num_words=10): """Return the given topic, formatted as a string.""" - return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, time, topn)]) + return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, time, num_words)]) diff --git a/gensim/test/test_dtm.py b/gensim/test/test_dtm.py index 36d893cf17..bd99136332 100644 --- a/gensim/test/test_dtm.py +++ b/gensim/test/test_dtm.py @@ -43,7 +43,7 @@ def testDtm(self): topics = model.show_topics(num_topics=2, times=2, num_words=10) self.assertEqual(len(topics), 4) - one_topic = model.show_topic(topicid=1, time=1, topn=10) + one_topic = model.show_topic(topicid=1, time=1, num_words=10) self.assertEqual(len(one_topic), 10) self.assertEqual(one_topic[0][1], u'idexx') @@ -56,7 +56,7 @@ def testDim(self): topics = model.show_topics(num_topics=2, times=2, num_words=10) self.assertEqual(len(topics), 4) - one_topic = model.show_topic(topicid=1, time=1, topn=10) + one_topic = model.show_topic(topicid=1, time=1, num_words=10) self.assertEqual(len(one_topic), 10) self.assertEqual(one_topic[0][1], u'skills')