Commit 012877a

DTM wrapper bug fixes caused by renaming num_words in #755 (#770)

bhargavvader authored and tmylk committed Jul 4, 2016
1 parent 003a886 commit 012877a

Showing 3 changed files with 43 additions and 83 deletions.
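Context for the diffs below: #755 renamed the num_words parameter in show_topics, but the wrapper's show_topic and print_topic still took topn, and show_topics still forwarded topn=topn — a name that no longer existed in its scope, so any call hit a NameError. A minimal, self-contained sketch of that failure mode (toy classes for illustration only, not the real gensim API):

class Broken:
    def show_topic(self, topicid, time, topn=50):
        return ['word'][:topn]

    def show_topics(self, num_words=10):
        # the body still references the old name -> NameError at call time
        return self.show_topic(0, 0, topn=topn)

class Fixed:
    def show_topic(self, topicid, time, num_words=50):
        return ['word'][:num_words]

    def show_topics(self, num_words=10):
        # the fix: one consistent keyword from caller to callee
        return self.show_topic(0, 0, num_words=num_words)

try:
    Broken().show_topics()
except NameError as err:
    print('before the fix:', err)   # name 'topn' is not defined

print('after the fix:', Fixed().show_topics())

This commit applies the rename consistently across the wrapper, its tests, and the example notebook, which also moves from Python 2 to Python 3.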
104 changes: 32 additions & 72 deletions docs/notebooks/dtm_example.ipynb
@@ -41,15 +41,7 @@
 "metadata": {
 "collapsed": false
 },
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"DEBUG:root:test\n"
-]
-}
-],
+"outputs": [],
 "source": [
 "logger = logging.getLogger()\n",
 "logger.setLevel(logging.DEBUG)\n",
@@ -110,16 +102,7 @@
 "metadata": {
 "collapsed": false
 },
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])\n",
-"INFO:gensim.corpora.dictionary:built Dictionary(546 unique tokens: [u'semantic', u'global', u'dynamic', u'focus', u'executing']...) from 10 documents (total 1112 corpus positions)\n"
-]
-}
-],
+"outputs": [],
 "source": [
 "class DTMcorpus(corpora.textcorpus.TextCorpus):\n",
 "\n",
@@ -171,20 +154,7 @@
 "metadata": {
 "collapsed": false
 },
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"INFO:gensim.models.wrappers.dtmmodel:serializing temporary corpus to /tmp/fc5bec_train-mult.dat\n",
-"INFO:gensim.corpora.bleicorpus:no word id mapping provided; initializing from corpus\n",
-"INFO:gensim.corpora.bleicorpus:storing corpus in Blei's LDA-C format into /tmp/fc5bec_train-mult.dat\n",
-"INFO:gensim.corpora.bleicorpus:saving vocabulary of 546 words to /tmp/fc5bec_train-mult.dat.vocab\n",
-"INFO:gensim.models.wrappers.dtmmodel:training DTM with args --ntopics=2 --model=dtm --mode=fit --initialize_lda=true --corpus_prefix=/tmp/fc5bec_train --outname=/tmp/fc5bec_train_out --alpha=0.01 --lda_max_em_iter=10 --lda_sequence_min_iter=6 --lda_sequence_max_iter=20 --top_chain_var=0.005 --rng_seed=0 \n",
-"INFO:gensim.models.wrappers.dtmmodel:Running command ['/home/bhargav/dtm/main', '--ntopics=2', '--model=dtm', '--mode=fit', '--initialize_lda=true', '--corpus_prefix=/tmp/fc5bec_train', '--outname=/tmp/fc5bec_train_out', '--alpha=0.01', '--lda_max_em_iter=10', '--lda_sequence_min_iter=6', '--lda_sequence_max_iter=20', '--top_chain_var=0.005', '--rng_seed=0']\n"
-]
-}
-],
+"outputs": [],
 "source": [
 "model = DtmModel(dtm_path, corpus, time_seq, num_topics=2,\n",
 " id2word=corpus.dictionary, initialize_lda=True)"
@@ -205,7 +175,7 @@
 },
 "outputs": [],
 "source": [
-"topics = model.show_topic(topicid=1, time=1, topn=10)"
+"topics = model.show_topic(topicid=1, time=1, num_words=10)"
 ]
 },
 {
@@ -218,16 +188,16 @@
 {
 "data": {
 "text/plain": [
-"[(0.036994582002755308, u'skills'),\n",
-" (0.035650395084747225, u'engineering'),\n",
-" (0.020960209762361768, u'knowledge'),\n",
-" (0.017327107555455742, u'electrical'),\n",
-" (0.017047119686861439, u'technical'),\n",
-" (0.015645884634432328, u'teams'),\n",
-" (0.013631466899125954, u'testing'),\n",
-" (0.012571386996720339, u'complex'),\n",
-" (0.011803114081816798, u'management'),\n",
-" (0.010380921159700848, u'customer')]"
+"[(0.023565028919164586, 'skills'),\n",
+" (0.02308969736545094, 'engineering'),\n",
+" (0.019616329462533579, 'idexx'),\n",
+" (0.0194313503731963, 'testing'),\n",
+" (0.01858957362093603, 'technical'),\n",
+" (0.017685337300946517, 'electrical'),\n",
+" (0.017483543705882995, 'management'),\n",
+" (0.015310984365058886, 'complex'),\n",
+" (0.014032951915032212, 'knowledge'),\n",
+" (0.012958700085355939, 'technology')]"
 ]
 },
 "execution_count": 9,
@@ -266,8 +236,8 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Distribution of Topic 0 0.000060\n",
-"Distribution of Topic 1 0.999940\n"
+"Distribution of Topic 0 0.562498\n",
+"Distribution of Topic 1 0.437502\n"
 ]
 }
 ],
@@ -276,7 +246,7 @@
 "num_topics = 2\n",
 "\n",
 "for i in range(0, num_topics):\n",
-" print \"Distribution of Topic %d %f\" % (i, model.gamma_[doc_number, i])"
+" print (\"Distribution of Topic %d %f\" % (i, model.gamma_[doc_number, i]))"
 ]
 },
 {
@@ -305,20 +275,7 @@
 "metadata": {
 "collapsed": false
 },
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"INFO:gensim.models.wrappers.dtmmodel:serializing temporary corpus to /tmp/d3d211_train-mult.dat\n",
-"INFO:gensim.corpora.bleicorpus:no word id mapping provided; initializing from corpus\n",
-"INFO:gensim.corpora.bleicorpus:storing corpus in Blei's LDA-C format into /tmp/d3d211_train-mult.dat\n",
-"INFO:gensim.corpora.bleicorpus:saving vocabulary of 546 words to /tmp/d3d211_train-mult.dat.vocab\n",
-"INFO:gensim.models.wrappers.dtmmodel:training DTM with args --ntopics=2 --model=fixed --mode=fit --initialize_lda=true --corpus_prefix=/tmp/d3d211_train --outname=/tmp/d3d211_train_out --alpha=0.01 --lda_max_em_iter=10 --lda_sequence_min_iter=6 --lda_sequence_max_iter=20 --top_chain_var=0.005 --rng_seed=0 \n",
-"INFO:gensim.models.wrappers.dtmmodel:Running command ['/home/bhargav/dtm/main', '--ntopics=2', '--model=fixed', '--mode=fit', '--initialize_lda=true', '--corpus_prefix=/tmp/d3d211_train', '--outname=/tmp/d3d211_train_out', '--alpha=0.01', '--lda_max_em_iter=10', '--lda_sequence_min_iter=6', '--lda_sequence_max_iter=20', '--top_chain_var=0.005', '--rng_seed=0']\n"
-]
-}
-],
+"outputs": [],
 "source": [
 "model = DtmModel(dtm_path, corpus, time_seq, num_topics=2,\n",
 " id2word=corpus.dictionary, initialize_lda=True, model='fixed')"
@@ -341,19 +298,22 @@
 },
 "outputs": [
 {
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"0.00438000069435\n"
-]
+"data": {
+"text/plain": [
+"0.0061833357763878861"
+]
+},
+"execution_count": 12,
+"metadata": {},
+"output_type": "execute_result"
 }
 ],
 "source": [
 "document_no = 1 #document 2\n",
 "topic_no = 1 #topic number 2\n",
 "time_slice = 0 #time slice 1\n",
 "\n",
-"print model.influences_time[time_slice][document_no][topic_no]"
+"model.influences_time[time_slice][document_no][topic_no]"
 ]
 },
 {
@@ -383,21 +343,21 @@
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "Python 2",
+"display_name": "Python 3",
 "language": "python",
-"name": "python2"
+"name": "python3"
 },
 "language_info": {
 "codemirror_mode": {
 "name": "ipython",
-"version": 2
+"version": 3
 },
 "file_extension": ".py",
 "mimetype": "text/x-python",
 "name": "python",
 "nbconvert_exporter": "python",
-"pygments_lexer": "ipython2",
-"version": "2.7.10"
+"pygments_lexer": "ipython3",
+"version": "3.5.1"
 }
 },
 "nbformat": 4,
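Taken together, the notebook cells above reduce to this end-to-end workflow. The sketch below is illustrative, not part of the commit: dtm_path and time_seq are hypothetical placeholders, and corpus is assumed to be the DTMcorpus built earlier in the notebook.

from gensim.models.wrappers.dtmmodel import DtmModel

dtm_path = '/path/to/dtm/main'   # hypothetical path to the compiled DTM binary
time_seq = [5, 5]                # hypothetical split: 10 documents over 2 time slices

# train a Dynamic Topic Model over the time slices
model = DtmModel(dtm_path, corpus, time_seq, num_topics=2,
                 id2word=corpus.dictionary, initialize_lda=True)

# per-document topic mixture: gamma_ rows are documents, columns are topics
doc_number = 1
for i in range(2):
    print('Distribution of Topic %d %f' % (i, model.gamma_[doc_number, i]))

# Document Influence Model variant: retrain with model='fixed', then read the
# influence of document 1 on topic 1 in time slice 0
model = DtmModel(dtm_path, corpus, time_seq, num_topics=2,
                 id2word=corpus.dictionary, initialize_lda=True, model='fixed')
print(model.influences_time[0][1][1])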
18 changes: 9 additions & 9 deletions gensim/models/wrappers/dtmmodel.py
@@ -173,9 +173,9 @@ def convert_input(self, corpus, time_slices):
         corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus)

         with utils.smart_open(self.ftimeslices(), 'wb') as fout:
-            fout.write(six.u(utils.to_utf8(str(len(self.time_slices)) + "\n")))
+            fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n"))
             for sl in time_slices:
-                fout.write(six.u(utils.to_utf8(str(sl) + "\n")))
+                fout.write(utils.to_utf8(str(sl) + "\n"))

     def train(self, corpus, time_slices, mode, model):
         """
@@ -271,18 +271,18 @@ def show_topics(self, num_topics=10, times=5, num_words=10, log=False, formatted
         for time in chosen_times:
             for i in chosen_topics:
                 if formatted:
-                    topic = self.print_topic(i, time, topn=topn)
+                    topic = self.print_topic(i, time, num_words=num_words)
                 else:
-                    topic = self.show_topic(i, time, topn=topn)
+                    topic = self.show_topic(i, time, num_words=num_words)
                 shown.append(topic)
         # if log:
         # logger.info("topic #%i (%.3f): %s" % (i, self.alpha[i],
         # topic))
         return shown

-    def show_topic(self, topicid, time, topn=50):
+    def show_topic(self, topicid, time, num_words=50):
         """
-        Return `topn` most probable words for the given `topicid`, as a list of
+        Return `num_words` most probable words for the given `topicid`, as a list of
         `(word_probability, word)` 2-tuples.
         """
@@ -293,10 +293,10 @@ def show_topic(self, topicid, time, topn=50):
         # normalize to probability dist
         topic = topic / topic.sum()
         # sort according to prob
-        bestn = matutils.argsort(topic, topn, reverse=True)
+        bestn = matutils.argsort(topic, num_words, reverse=True)
         beststr = [(topic[id], self.id2word[id]) for id in bestn]
         return beststr

-    def print_topic(self, topicid, time, topn=10):
+    def print_topic(self, topicid, time, num_words=10):
         """Return the given topic, formatted as a string."""
-        return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, time, topn)])
+        return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, time, num_words)])
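For reference, the renamed query methods read like this after the fix (a sketch; model is a trained DtmModel as above and the numbers are illustrative):

# ten most probable words for topic 1 at time slice 1, as (probability, word) pairs
one_topic = model.show_topic(topicid=1, time=1, num_words=10)

# the same topic as one formatted string, e.g. '0.024*skills + 0.023*engineering + ...'
print(model.print_topic(topicid=1, time=1, num_words=10))

# every (topic, time) combination: 2 topics x 2 times -> a list of 4 entries
topics = model.show_topics(num_topics=2, times=2, num_words=10)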
4 changes: 2 additions & 2 deletions gensim/test/test_dtm.py
@@ -43,7 +43,7 @@ def testDtm(self):
         topics = model.show_topics(num_topics=2, times=2, num_words=10)
         self.assertEqual(len(topics), 4)

-        one_topic = model.show_topic(topicid=1, time=1, topn=10)
+        one_topic = model.show_topic(topicid=1, time=1, num_words=10)
         self.assertEqual(len(one_topic), 10)
         self.assertEqual(one_topic[0][1], u'idexx')
@@ -56,7 +56,7 @@ def testDim(self):
         topics = model.show_topics(num_topics=2, times=2, num_words=10)
         self.assertEqual(len(topics), 4)

-        one_topic = model.show_topic(topicid=1, time=1, topn=10)
+        one_topic = model.show_topic(topicid=1, time=1, num_words=10)
         self.assertEqual(len(one_topic), 10)
         self.assertEqual(one_topic[0][1], u'skills')
