Commit 012877a

DTM wrapper bug fixes caused by renaming num_words in #755 (#770)

bhargavvader authored and tmylk committed Jul 4, 2016
1 parent 003a886 commit 012877a

Showing 3 changed files with 43 additions and 83 deletions.
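Context for the diffs below: #755 renamed the num_words parameter in show_topics, but the wrapper's show_topic and print_topic still took topn, and show_topics still forwarded topn=topn — a name that no longer existed in its scope, so any call hit a NameError. A minimal, self-contained sketch of that failure mode (toy classes for illustration only, not the real gensim API):

class Broken:
    def show_topic(self, topicid, time, topn=50):
        return ['word'][:topn]

    def show_topics(self, num_words=10):
        # the body still references the old name -> NameError at call time
        return self.show_topic(0, 0, topn=topn)

class Fixed:
    def show_topic(self, topicid, time, num_words=50):
        return ['word'][:num_words]

    def show_topics(self, num_words=10):
        # the fix: one consistent keyword from caller to callee
        return self.show_topic(0, 0, num_words=num_words)

try:
    Broken().show_topics()
except NameError as err:
    print('before the fix:', err)   # name 'topn' is not defined

print('after the fix:', Fixed().show_topics())

This commit applies the rename consistently across the wrapper, its tests, and the example notebook, which also moves from Python 2 to Python 3.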
104 changes: 32 additions & 72 deletions docs/notebooks/dtm_example.ipynb
@@ -41,15 +41,7 @@
 "metadata": {
 "collapsed": false
 },
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"DEBUG:root:test\n"
-]
-}
-],
+"outputs": [],
 "source": [
 "logger = logging.getLogger()\n",
 "logger.setLevel(logging.DEBUG)\n",
@@ -110,16 +102,7 @@
 "metadata": {
 "collapsed": false
 },
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])\n",
-"INFO:gensim.corpora.dictionary:built Dictionary(546 unique tokens: [u'semantic', u'global', u'dynamic', u'focus', u'executing']...) from 10 documents (total 1112 corpus positions)\n"
-]
-}
-],
+"outputs": [],
 "source": [
 "class DTMcorpus(corpora.textcorpus.TextCorpus):\n",
 "\n",
@@ -171,20 +154,7 @@
 "metadata": {
 "collapsed": false
 },
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"INFO:gensim.models.wrappers.dtmmodel:serializing temporary corpus to /tmp/fc5bec_train-mult.dat\n",
-"INFO:gensim.corpora.bleicorpus:no word id mapping provided; initializing from corpus\n",
-"INFO:gensim.corpora.bleicorpus:storing corpus in Blei's LDA-C format into /tmp/fc5bec_train-mult.dat\n",
-"INFO:gensim.corpora.bleicorpus:saving vocabulary of 546 words to /tmp/fc5bec_train-mult.dat.vocab\n",
-"INFO:gensim.models.wrappers.dtmmodel:training DTM with args --ntopics=2 --model=dtm --mode=fit --initialize_lda=true --corpus_prefix=/tmp/fc5bec_train --outname=/tmp/fc5bec_train_out --alpha=0.01 --lda_max_em_iter=10 --lda_sequence_min_iter=6 --lda_sequence_max_iter=20 --top_chain_var=0.005 --rng_seed=0 \n",
-"INFO:gensim.models.wrappers.dtmmodel:Running command ['/home/bhargav/dtm/main', '--ntopics=2', '--model=dtm', '--mode=fit', '--initialize_lda=true', '--corpus_prefix=/tmp/fc5bec_train', '--outname=/tmp/fc5bec_train_out', '--alpha=0.01', '--lda_max_em_iter=10', '--lda_sequence_min_iter=6', '--lda_sequence_max_iter=20', '--top_chain_var=0.005', '--rng_seed=0']\n"
-]
-}
-],
+"outputs": [],
 "source": [
 "model = DtmModel(dtm_path, corpus, time_seq, num_topics=2,\n",
 " id2word=corpus.dictionary, initialize_lda=True)"
@@ -205,7 +175,7 @@
 },
 "outputs": [],
 "source": [
-"topics = model.show_topic(topicid=1, time=1, topn=10)"
+"topics = model.show_topic(topicid=1, time=1, num_words=10)"
 ]
 },
 {
@@ -218,16 +188,16 @@
 {
 "data": {
 "text/plain": [
-"[(0.036994582002755308, u'skills'),\n",
-" (0.035650395084747225, u'engineering'),\n",
-" (0.020960209762361768, u'knowledge'),\n",
-" (0.017327107555455742, u'electrical'),\n",
-" (0.017047119686861439, u'technical'),\n",
-" (0.015645884634432328, u'teams'),\n",
-" (0.013631466899125954, u'testing'),\n",
-" (0.012571386996720339, u'complex'),\n",
-" (0.011803114081816798, u'management'),\n",
-" (0.010380921159700848, u'customer')]"
+"[(0.023565028919164586, 'skills'),\n",
+" (0.02308969736545094, 'engineering'),\n",
+" (0.019616329462533579, 'idexx'),\n",
+" (0.0194313503731963, 'testing'),\n",
+" (0.01858957362093603, 'technical'),\n",
+" (0.017685337300946517, 'electrical'),\n",
+" (0.017483543705882995, 'management'),\n",
+" (0.015310984365058886, 'complex'),\n",
+" (0.014032951915032212, 'knowledge'),\n",
+" (0.012958700085355939, 'technology')]"
 ]
 },
 "execution_count": 9,
@@ -266,8 +236,8 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Distribution of Topic 0 0.000060\n",
-"Distribution of Topic 1 0.999940\n"
+"Distribution of Topic 0 0.562498\n",
+"Distribution of Topic 1 0.437502\n"
 ]
 }
 ],
@@ -276,7 +246,7 @@
 "num_topics = 2\n",
 "\n",
 "for i in range(0, num_topics):\n",
-" print \"Distribution of Topic %d %f\" % (i, model.gamma_[doc_number, i])"
+" print (\"Distribution of Topic %d %f\" % (i, model.gamma_[doc_number, i]))"
 ]
 },
 {
@@ -305,20 +275,7 @@
 "metadata": {
 "collapsed": false
 },
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"INFO:gensim.models.wrappers.dtmmodel:serializing temporary corpus to /tmp/d3d211_train-mult.dat\n",
-"INFO:gensim.corpora.bleicorpus:no word id mapping provided; initializing from corpus\n",
-"INFO:gensim.corpora.bleicorpus:storing corpus in Blei's LDA-C format into /tmp/d3d211_train-mult.dat\n",
-"INFO:gensim.corpora.bleicorpus:saving vocabulary of 546 words to /tmp/d3d211_train-mult.dat.vocab\n",
-"INFO:gensim.models.wrappers.dtmmodel:training DTM with args --ntopics=2 --model=fixed --mode=fit --initialize_lda=true --corpus_prefix=/tmp/d3d211_train --outname=/tmp/d3d211_train_out --alpha=0.01 --lda_max_em_iter=10 --lda_sequence_min_iter=6 --lda_sequence_max_iter=20 --top_chain_var=0.005 --rng_seed=0 \n",
-"INFO:gensim.models.wrappers.dtmmodel:Running command ['/home/bhargav/dtm/main', '--ntopics=2', '--model=fixed', '--mode=fit', '--initialize_lda=true', '--corpus_prefix=/tmp/d3d211_train', '--outname=/tmp/d3d211_train_out', '--alpha=0.01', '--lda_max_em_iter=10', '--lda_sequence_min_iter=6', '--lda_sequence_max_iter=20', '--top_chain_var=0.005', '--rng_seed=0']\n"
-]
-}
-],
+"outputs": [],
 "source": [
 "model = DtmModel(dtm_path, corpus, time_seq, num_topics=2,\n",
 " id2word=corpus.dictionary, initialize_lda=True, model='fixed')"
@@ -341,19 +298,22 @@
 },
 "outputs": [
 {
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"0.00438000069435\n"
-]
+"data": {
+"text/plain": [
+"0.0061833357763878861"
+]
+},
+"execution_count": 12,
+"metadata": {},
+"output_type": "execute_result"
 }
 ],
 "source": [
 "document_no = 1 #document 2\n",
 "topic_no = 1 #topic number 2\n",
 "time_slice = 0 #time slice 1\n",
 "\n",
-"print model.influences_time[time_slice][document_no][topic_no]"
+"model.influences_time[time_slice][document_no][topic_no]"
 ]
 },
 {
@@ -383,21 +343,21 @@
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "Python 2",
+"display_name": "Python 3",
 "language": "python",
-"name": "python2"
+"name": "python3"
 },
 "language_info": {
 "codemirror_mode": {
 "name": "ipython",
-"version": 2
+"version": 3
 },
 "file_extension": ".py",
 "mimetype": "text/x-python",
 "name": "python",
 "nbconvert_exporter": "python",
-"pygments_lexer": "ipython2",
-"version": "2.7.10"
+"pygments_lexer": "ipython3",
+"version": "3.5.1"
 }
 },
 "nbformat": 4,
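Taken together, the notebook cells above reduce to this end-to-end workflow. The sketch below is illustrative, not part of the commit: dtm_path and time_seq are hypothetical placeholders, and corpus is assumed to be the DTMcorpus built earlier in the notebook.

from gensim.models.wrappers.dtmmodel import DtmModel

dtm_path = '/path/to/dtm/main'   # hypothetical path to the compiled DTM binary
time_seq = [5, 5]                # hypothetical split: 10 documents over 2 time slices

# train a Dynamic Topic Model over the time slices
model = DtmModel(dtm_path, corpus, time_seq, num_topics=2,
                 id2word=corpus.dictionary, initialize_lda=True)

# per-document topic mixture: gamma_ rows are documents, columns are topics
doc_number = 1
for i in range(2):
    print('Distribution of Topic %d %f' % (i, model.gamma_[doc_number, i]))

# Document Influence Model variant: retrain with model='fixed', then read the
# influence of document 1 on topic 1 in time slice 0
model = DtmModel(dtm_path, corpus, time_seq, num_topics=2,
                 id2word=corpus.dictionary, initialize_lda=True, model='fixed')
print(model.influences_time[0][1][1])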
18 changes: 9 additions & 9 deletions gensim/models/wrappers/dtmmodel.py
@@ -173,9 +173,9 @@ def convert_input(self, corpus, time_slices):
         corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus)

         with utils.smart_open(self.ftimeslices(), 'wb') as fout:
-            fout.write(six.u(utils.to_utf8(str(len(self.time_slices)) + "\n")))
+            fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n"))
             for sl in time_slices:
-                fout.write(six.u(utils.to_utf8(str(sl) + "\n")))
+                fout.write(utils.to_utf8(str(sl) + "\n"))

     def train(self, corpus, time_slices, mode, model):
         """
@@ -271,18 +271,18 @@ def show_topics(self, num_topics=10, times=5, num_words=10, log=False, formatted
         for time in chosen_times:
             for i in chosen_topics:
                 if formatted:
-                    topic = self.print_topic(i, time, topn=topn)
+                    topic = self.print_topic(i, time, num_words=num_words)
                 else:
-                    topic = self.show_topic(i, time, topn=topn)
+                    topic = self.show_topic(i, time, num_words=num_words)
                 shown.append(topic)
         # if log:
         # logger.info("topic #%i (%.3f): %s" % (i, self.alpha[i],
         # topic))
         return shown

-    def show_topic(self, topicid, time, topn=50):
+    def show_topic(self, topicid, time, num_words=50):
         """
-        Return `topn` most probable words for the given `topicid`, as a list of
+        Return `num_words` most probable words for the given `topicid`, as a list of
         `(word_probability, word)` 2-tuples.
         """
@@ -293,10 +293,10 @@ def show_topic(self, topicid, time, topn=50):
         # normalize to probability dist
         topic = topic / topic.sum()
         # sort according to prob
-        bestn = matutils.argsort(topic, topn, reverse=True)
+        bestn = matutils.argsort(topic, num_words, reverse=True)
         beststr = [(topic[id], self.id2word[id]) for id in bestn]
         return beststr

-    def print_topic(self, topicid, time, topn=10):
+    def print_topic(self, topicid, time, num_words=10):
         """Return the given topic, formatted as a string."""
-        return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, time, topn)])
+        return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, time, num_words)])
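For reference, the renamed query methods read like this after the fix (a sketch; model is a trained DtmModel as above and the numbers are illustrative):

# ten most probable words for topic 1 at time slice 1, as (probability, word) pairs
one_topic = model.show_topic(topicid=1, time=1, num_words=10)

# the same topic as one formatted string, e.g. '0.024*skills + 0.023*engineering + ...'
print(model.print_topic(topicid=1, time=1, num_words=10))

# every (topic, time) combination: 2 topics x 2 times -> a list of 4 entries
topics = model.show_topics(num_topics=2, times=2, num_words=10)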
4 changes: 2 additions & 2 deletions gensim/test/test_dtm.py
@@ -43,7 +43,7 @@ def testDtm(self):
         topics = model.show_topics(num_topics=2, times=2, num_words=10)
         self.assertEqual(len(topics), 4)

-        one_topic = model.show_topic(topicid=1, time=1, topn=10)
+        one_topic = model.show_topic(topicid=1, time=1, num_words=10)
         self.assertEqual(len(one_topic), 10)
         self.assertEqual(one_topic[0][1], u'idexx')
@@ -56,7 +56,7 @@ def testDim(self):
         topics = model.show_topics(num_topics=2, times=2, num_words=10)
         self.assertEqual(len(topics), 4)

-        one_topic = model.show_topic(topicid=1, time=1, topn=10)
+        one_topic = model.show_topic(topicid=1, time=1, num_words=10)
         self.assertEqual(len(one_topic), 10)
         self.assertEqual(one_topic[0][1], u'skills')
