From c697319b5d6d35646479469354511d9c4dfcace9 Mon Sep 17 00:00:00 2001 From: Ajinkya Kale Date: Wed, 15 Mar 2017 20:47:33 -0700 Subject: [PATCH 1/3] fix for max_iter_dump calculation --- gensim/models/wrappers/wordrank.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py index ae9746dbf0..be0cb93e87 100644 --- a/gensim/models/wrappers/wordrank.py +++ b/gensim/models/wrappers/wordrank.py @@ -141,7 +141,7 @@ def train(cls, wr_path, corpus_file, out_path, size=100, window=15, symmetric=1, output = utils.check_output(args=cmd) # use embeddings from max. iteration's dump - max_iter_dump = iter / dump_period * dump_period - 1 + max_iter_dump = (iter - 1) - (iter - 1) % dump_period copyfile('model_word_%d.txt' % max_iter_dump, 'wordrank.words') copyfile('model_context_%d.txt' % max_iter_dump, 'wordrank.contexts') model = cls.load_wordrank_model('wordrank.words', os.path.join('meta', vocab_file), 'wordrank.contexts', sorted_vocab, ensemble) From 121333ba912fcbea9d44646b74f1690dec627a6b Mon Sep 17 00:00:00 2001 From: Ajinkya Kale Date: Thu, 16 Mar 2017 23:46:14 -0700 Subject: [PATCH 2/3] fix max_iter_dump calculation --- gensim/models/wrappers/wordrank.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py index be0cb93e87..fa31d14a30 100644 --- a/gensim/models/wrappers/wordrank.py +++ b/gensim/models/wrappers/wordrank.py @@ -113,6 +113,11 @@ def train(cls, wr_path, corpus_file, out_path, size=100, window=15, symmetric=1, with smart_open(meta_file, 'wb') as f: meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(numwords, numwords, numlines, cooccurrence_shuf_file, numwords, vocab_file) f.write(meta_info.encode('utf-8')) + + if iter % dump_period == 0: + iter += 1 + else: + logger.warning('Resultant embedding would be from %d iteration', iter - iter % dump_period) wr_args = { 'path': 'meta', @@ -141,7 +146,7 @@ def train(cls, wr_path, corpus_file, out_path, size=100, window=15, symmetric=1, output = utils.check_output(args=cmd) # use embeddings from max. iteration's dump - max_iter_dump = (iter - 1) - (iter - 1) % dump_period + max_iter_dump = iter - iter % dump_period copyfile('model_word_%d.txt' % max_iter_dump, 'wordrank.words') copyfile('model_context_%d.txt' % max_iter_dump, 'wordrank.contexts') model = cls.load_wordrank_model('wordrank.words', os.path.join('meta', vocab_file), 'wordrank.contexts', sorted_vocab, ensemble) From 2989d4aae19a451ed239eb68eb1abceb9009e269 Mon Sep 17 00:00:00 2001 From: Ajinkya Kale Date: Fri, 17 Mar 2017 13:58:35 -0700 Subject: [PATCH 3/3] set iter default to 90 --- gensim/models/wrappers/wordrank.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py index fa31d14a30..b99fe2d272 100644 --- a/gensim/models/wrappers/wordrank.py +++ b/gensim/models/wrappers/wordrank.py @@ -46,7 +46,7 @@ class Wordrank(KeyedVectors): @classmethod def train(cls, wr_path, corpus_file, out_path, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0, - sgd_num=100, lrate=0.001, period=10, iter=91, epsilon=0.75, dump_period=10, reg=0, alpha=100, + sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100, beta=99, loss='hinge', memory=4.0, cleanup_files=True, sorted_vocab=1, ensemble=0): """ `wr_path` is the path to the Wordrank directory.