From 701d81cd9a36e20ca19131814b238aabbf2afcf7 Mon Sep 17 00:00:00 2001 From: tmylk Date: Tue, 14 Feb 2017 23:14:17 +0000 Subject: [PATCH 1/4] Remove direct access to keyed vectors --- gensim/models/word2vec.py | 106 +------------------------------------- 1 file changed, 1 insertion(+), 105 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 8a62d02588..f7882b1bc1 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -330,9 +330,6 @@ class Word2Vec(utils.SaveLoad): """ - # TODO: delete this flag after direct access to syn0norm, syn0, vocab is removed - keyed_vector_warnings = True - def __init__( self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, @@ -1092,22 +1089,7 @@ def seeded_vector(self, seed_string): once = random.RandomState(self.hashfxn(seed_string) & 0xffffffff) return (once.rand(self.vector_size) - 0.5) / self.vector_size - def save_word2vec_format(self, *args, **kwargs): - if Word2Vec.keyed_vector_warnings: - logger.warning('word2vec.save_word2vec_format will be deprected in future gensim releases. Please use model.wv.save_word2vec_format') - return self.wv.save_word2vec_format(*args, **kwargs) - - @classmethod - def load_word2vec_format(cls, *args, **kwargs): - if Word2Vec.keyed_vector_warnings: - logger.warning('Word2vec.load_word2vec_format will be deprected in future gensim releases. 
Please use KeyedVectors.load_word2vec_format') - - wv = KeyedVectors.load_word2vec_format(*args, **kwargs) - result = cls(size=wv.syn0.shape[1]) - result.wv = wv - return result - - def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='utf8', unicode_errors='strict'): + def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='utf8', unicode_errors='strict'): """ Merge the input-hidden weight matrix from the original C word2vec-tool format given, where it intersects with the current vocabulary. (No words are added to the @@ -1177,86 +1159,6 @@ def doesnt_match(self, words): def __getitem__(self, words): return self.wv.__getitem__(words) - @staticmethod - def disable_keyed_vectors_warnings(): - Word2Vec.keyed_vector_warnings = False - - @staticmethod - def enable_keyed_vectors_warnings(): - Word2Vec.keyed_vector_warnings = True - - @property - def syn0norm(self): - if Word2Vec.keyed_vector_warnings: - logger.warning('direct access to syn0norm will not be supported in future gensim releases, please use model.wv.syn0norm') - return self.wv.syn0norm - - @syn0norm.setter - def syn0norm(self, value): - if Word2Vec.keyed_vector_warnings: - logger.warning('direct access to syn0norm will not be supported in future gensim releases, please use model.wv.syn0norm') - self.wv.syn0norm = value - - @syn0norm.deleter - def syn0norm(self): - if Word2Vec.keyed_vector_warnings: - logger.warning('direct access to syn0norm will not be supported in future gensim releases, please use model.wv.syn0norm') - del self.wv.syn0norm - - @property - def syn0(self): - if Word2Vec.keyed_vector_warnings: - logger.warning('direct access to syn0 will not be supported in future gensim releases, please use model.wv.syn0') - return self.wv.syn0 - - @syn0.setter - def syn0(self, value): - if Word2Vec.keyed_vector_warnings: - logger.warning('direct access to syn0 will not be supported in future gensim releases, please use model.wv.syn0') - self.wv.syn0 = value - 
- @syn0.deleter - def syn0(self): - if Word2Vec.keyed_vector_warnings: - logger.warning('direct access to syn0 will not be supported in future gensim releases, please use model.wv.syn0') - del self.wv.syn0 - - @property - def vocab(self): - if Word2Vec.keyed_vector_warnings: - logger.warning('direct access to vocab will not be supported in future gensim releases, please use model.wv.vocab') - return self.wv.vocab - - @vocab.setter - def vocab(self, value): - if Word2Vec.keyed_vector_warnings: - logger.warning('direct access to vocab will not be supported in future gensim releases, please use model.wv.vocab') - self.wv.vocab = value - - @vocab.deleter - def vocab(self): - if Word2Vec.keyed_vector_warnings: - logger.warning('direct access to vocab will not be supported in future gensim releases, please use model.wv.vocab') - del self.wv.vocab - - @property - def index2word(self): - if Word2Vec.keyed_vector_warnings: - logger.warning('direct access to index2word will not be supported in future gensim releases, please use model.wv.index2word') - return self.wv.index2word - - @index2word.setter - def index2word(self, value): - if Word2Vec.keyed_vector_warnings: - logger.warning('direct access to index2word will not be supported in future gensim releases, please use model.wv.index2word') - self.wv.index2word = value - - @index2word.deleter - def index2word(self): - if Word2Vec.keyed_vector_warnings: - logger.warning('direct access to index2word will not be supported in future gensim releases, please use model.wv.index2word') - del self.wv.index2word - def __contains__(self, word): return self.wv.__contains__(word) @@ -1329,19 +1231,14 @@ def delete_temporary_training_data(self, replace_word_vectors_with_normalized=Fa def save(self, *args, **kwargs): # don't bother storing the cached normalized vectors, recalculable table - # TODO: after introducing KeyedVectors now syn0, vocab, id2word are saved TWO times. 
Once in word2vec and once in keyedvectors - # After keyedvectors are deprecated it will be only once - Word2Vec.disable_keyed_vectors_warnings() kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'table', 'cum_table']) super(Word2Vec, self).save(*args, **kwargs) - Word2Vec.enable_keyed_vectors_warnings() save.__doc__ = utils.SaveLoad.save.__doc__ @classmethod def load(cls, *args, **kwargs): - Word2Vec.disable_keyed_vectors_warnings() model = super(Word2Vec, cls).load(*args, **kwargs) # update older models if hasattr(model, 'table'): @@ -1363,7 +1260,6 @@ def load(cls, *args, **kwargs): if not hasattr(model, 'train_count'): model.train_count = 0 model.total_train_time = 0 - Word2Vec.enable_keyed_vectors_warnings() return model def _load_specials(self, *args, **kwargs): From b46b886a76b3c7062d9fda11e5becd26219dfd51 Mon Sep 17 00:00:00 2001 From: tmylk Date: Tue, 14 Feb 2017 23:20:09 +0000 Subject: [PATCH 2/4] Fix typo --- gensim/models/word2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index f7882b1bc1..34ba45bd16 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1089,7 +1089,7 @@ def seeded_vector(self, seed_string): once = random.RandomState(self.hashfxn(seed_string) & 0xffffffff) return (once.rand(self.vector_size) - 0.5) / self.vector_size - def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='utf8', unicode_errors='strict'): + def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='utf8', unicode_errors='strict'): """ Merge the input-hidden weight matrix from the original C word2vec-tool format given, where it intersects with the current vocabulary. 
(No words are added to the From b6740e3b35d884a4f5d888337a882e8b95680cd3 Mon Sep 17 00:00:00 2001 From: tmylk Date: Wed, 15 Feb 2017 20:12:27 +0000 Subject: [PATCH 3/4] Fix tests --- gensim/similarities/index.py | 2 +- gensim/test/test_doc2vec.py | 4 ++-- gensim/test/test_similarities.py | 6 ++--- gensim/test/test_word2vec.py | 40 ++++++++++++++++---------------- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/gensim/similarities/index.py b/gensim/similarities/index.py index 23da13508a..e227994da0 100644 --- a/gensim/similarities/index.py +++ b/gensim/similarities/index.py @@ -59,7 +59,7 @@ def build_from_word2vec(self): """Build an Annoy index using word vectors from a Word2Vec model""" self.model.init_sims() - return self._build_from_model(self.model.wv.syn0norm, self.model.index2word + return self._build_from_model(self.model.wv.syn0norm, self.model.wv.index2word , self.model.vector_size) def build_from_doc2vec(self): diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index ad658250f5..1d306bb22d 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -283,7 +283,7 @@ def test_mixed_tag_types(self): def models_equal(self, model, model2): # check words/hidden-weights - self.assertEqual(len(model.vocab), len(model2.vocab)) + self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab)) self.assertTrue(np.allclose(model.wv.syn0, model2.wv.syn0)) if model.hs: self.assertTrue(np.allclose(model.syn1, model2.syn1)) @@ -306,7 +306,7 @@ def test_delete_temporary_training_data(self): self.assertTrue(hasattr(model, 'syn0_lockf')) model.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=False) self.assertTrue(len(model['human']), 10) - self.assertTrue(model.vocab['graph'].count, 5) + self.assertTrue(model.wv.vocab['graph'].count, 5) self.assertTrue(not hasattr(model, 'syn1')) self.assertTrue(not hasattr(model, 'syn1neg')) self.assertTrue(not hasattr(model, 'syn0_lockf')) diff --git 
a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index 88addc6d9c..88596bb5b2 100644 --- a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -485,8 +485,8 @@ def testLoadMissingRaisesError(self): self.assertRaises(IOError, test_index.load, fname='test-index') def assertVectorIsSimilarToItself(self, model, index): - vector = model.syn0norm[0] - label = model.index2word[0] + vector = model.wv.syn0norm[0] + label = model.wv.index2word[0] approx_neighbors = index.most_similar(vector, 1) word, similarity = approx_neighbors[0] @@ -494,7 +494,7 @@ def assertVectorIsSimilarToItself(self, model, index): self.assertEqual(similarity, 1.0) def assertApproxNeighborsMatchExact(self, model, index): - vector = model.syn0norm[0] + vector = model.wv.syn0norm[0] approx_neighbors = model.most_similar([vector], topn=5, indexer=index) exact_neighbors = model.most_similar(positive=[vector], topn=5) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 035765e8a0..48dbff87a1 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -210,31 +210,31 @@ def testLoadPreKeyedVectorModel(self): def testLoadPreKeyedVectorModelCFormat(self): """Test loading pre-KeyedVectors word2vec model saved in word2vec format""" - model = word2vec.Word2Vec.load_word2vec_format(datapath('word2vec_pre_kv_c')) + model = keyedvectors.KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c')) self.assertTrue(model.wv.syn0.shape[0] == len(model.wv.vocab)) def testPersistenceWord2VecFormat(self): """Test storing/loading the entire model in word2vec format.""" model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() - model.save_word2vec_format(testfile(), binary=True) - binary_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True) + model.wv.save_word2vec_format(testfile(), binary=True) + binary_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True) 
binary_model.init_sims(replace=False) self.assertTrue(np.allclose(model['human'], binary_model['human'])) - norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True) + norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True) norm_only_model.init_sims(replace=True) self.assertFalse(np.allclose(model['human'], norm_only_model['human'])) self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human'])) - limited_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True, limit=3) + limited_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, limit=3) self.assertEquals(len(limited_model.wv.syn0), 3) - half_precision_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True, datatype=np.float16) + half_precision_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, datatype=np.float16) self.assertEquals(binary_model.wv.syn0.nbytes, half_precision_model.wv.syn0.nbytes * 2) def testNoTrainingCFormat(self): model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() - model.save_word2vec_format(testfile(), binary=True) - binary_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True) + model.wv.save_word2vec_format(testfile(), binary=True) + binary_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True) self.assertRaises(ValueError, binary_model.train, sentences) @@ -242,31 +242,31 @@ def testTooShortBinaryWord2VecFormat(self): tfile = testfile() model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() - model.save_word2vec_format(tfile, binary=True) + model.wv.save_word2vec_format(tfile, binary=True) f = open(tfile, 'r+b') f.write(b'13') # write wrong (too-long) vector count f.close() - self.assertRaises(EOFError, word2vec.Word2Vec.load_word2vec_format, tfile, binary=True) + self.assertRaises(EOFError, 
keyedvectors.KeyedVectors.load_word2vec_format, tfile, binary=True) def testTooShortTextWord2VecFormat(self): tfile = testfile() model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() - model.save_word2vec_format(tfile, binary=False) + model.wv.save_word2vec_format(tfile, binary=False) f = open(tfile, 'r+b') f.write(b'13') # write wrong (too-long) vector count f.close() - self.assertRaises(EOFError, word2vec.Word2Vec.load_word2vec_format, tfile, binary=False) + self.assertRaises(EOFError, keyedvectors.KeyedVectors.load_word2vec_format, tfile, binary=False) def testPersistenceWord2VecFormatNonBinary(self): """Test storing/loading the entire model in word2vec non-binary format.""" model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() - model.save_word2vec_format(testfile(), binary=False) - text_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=False) + model.wv.save_word2vec_format(testfile(), binary=False) + text_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=False) text_model.init_sims(False) self.assertTrue(np.allclose(model['human'], text_model['human'], atol=1e-6)) - norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=False) + norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=False) norm_only_model.init_sims(True) self.assertFalse(np.allclose(model['human'], norm_only_model['human'], atol=1e-6)) self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human'], atol=1e-4)) @@ -276,8 +276,8 @@ def testPersistenceWord2VecFormatWithVocab(self): model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() testvocab = os.path.join(tempfile.gettempdir(), 'gensim_word2vec.vocab') - model.save_word2vec_format(testfile(), testvocab, binary=True) - binary_model_with_vocab = word2vec.Word2Vec.load_word2vec_format(testfile(), testvocab, binary=True) + model.wv.save_word2vec_format(testfile(), testvocab, 
binary=True) + binary_model_with_vocab = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), testvocab, binary=True) self.assertEqual(model.wv.vocab['human'].count, binary_model_with_vocab.wv.vocab['human'].count) def testPersistenceKeyedVectorsFormatWithVocab(self): @@ -296,8 +296,8 @@ def testPersistenceWord2VecFormatCombinationWithStandardPersistence(self): model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() testvocab = os.path.join(tempfile.gettempdir(), 'gensim_word2vec.vocab') - model.save_word2vec_format(testfile(), testvocab, binary=True) - binary_model_with_vocab = word2vec.Word2Vec.load_word2vec_format(testfile(), testvocab, binary=True) + model.wv.save_word2vec_format(testfile(), testvocab, binary=True) + binary_model_with_vocab = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), testvocab, binary=True) binary_model_with_vocab.save(testfile()) binary_model_with_vocab = word2vec.Word2Vec.load(testfile()) self.assertEqual(model.wv.vocab['human'].count, binary_model_with_vocab.wv.vocab['human'].count) @@ -416,7 +416,7 @@ def model_sanity(self, model, train=True): orig0 = np.copy(model.wv.syn0[0]) model.train(list_corpus) self.assertFalse((orig0 == model.wv.syn0[1]).all()) # vector should vary after training - sims = model.most_similar('war', topn=len(model.index2word)) + sims = model.most_similar('war', topn=len(model.wv.index2word)) t_rank = [word for word, score in sims].index('terrorism') # in >200 calibration runs w/ calling parameters, 'terrorism' in 50-most_sim for 'war' self.assertLess(t_rank, 50) From 807571f1a31544723e83dd5ec9696d655065eea9 Mon Sep 17 00:00:00 2001 From: tmylk Date: Thu, 16 Feb 2017 18:16:02 +0000 Subject: [PATCH 4/4] Fix persistence tests --- gensim/test/test_word2vec.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 48dbff87a1..462ddd6a88 100644 --- 
a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -211,30 +211,32 @@ def testLoadPreKeyedVectorModel(self): def testLoadPreKeyedVectorModelCFormat(self): """Test loading pre-KeyedVectors word2vec model saved in word2vec format""" model = keyedvectors.KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c')) - self.assertTrue(model.wv.syn0.shape[0] == len(model.wv.vocab)) + self.assertTrue(model.syn0.shape[0] == len(model.vocab)) def testPersistenceWord2VecFormat(self): """Test storing/loading the entire model in word2vec format.""" model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() model.wv.save_word2vec_format(testfile(), binary=True) - binary_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True) - binary_model.init_sims(replace=False) - self.assertTrue(np.allclose(model['human'], binary_model['human'])) + binary_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True) + binary_model_kv.init_sims(replace=False) + self.assertTrue(np.allclose(model['human'], binary_model_kv['human'])) norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True) norm_only_model.init_sims(replace=True) self.assertFalse(np.allclose(model['human'], norm_only_model['human'])) self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human'])) - limited_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, limit=3) - self.assertEquals(len(limited_model.wv.syn0), 3) - half_precision_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, datatype=np.float16) - self.assertEquals(binary_model.wv.syn0.nbytes, half_precision_model.wv.syn0.nbytes * 2) + limited_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, limit=3) + self.assertEquals(len(limited_model_kv.syn0), 3) + half_precision_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), 
binary=True, datatype=np.float16) + self.assertEquals(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2) def testNoTrainingCFormat(self): model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() model.wv.save_word2vec_format(testfile(), binary=True) - binary_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True) + kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True) + binary_model = word2vec.Word2Vec() + binary_model.wv = kv self.assertRaises(ValueError, binary_model.train, sentences) @@ -277,8 +279,8 @@ def testPersistenceWord2VecFormatWithVocab(self): model.init_sims() testvocab = os.path.join(tempfile.gettempdir(), 'gensim_word2vec.vocab') model.wv.save_word2vec_format(testfile(), testvocab, binary=True) - binary_model_with_vocab = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), testvocab, binary=True) - self.assertEqual(model.wv.vocab['human'].count, binary_model_with_vocab.wv.vocab['human'].count) + binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), testvocab, binary=True) + self.assertEqual(model.wv.vocab['human'].count, binary_model_with_vocab_kv.vocab['human'].count) def testPersistenceKeyedVectorsFormatWithVocab(self): """Test storing/loading the entire model and vocabulary in word2vec format.""" @@ -292,15 +294,15 @@ def testPersistenceKeyedVectorsFormatWithVocab(self): def testPersistenceWord2VecFormatCombinationWithStandardPersistence(self): """Test storing/loading the entire model and vocabulary in word2vec format chained with - saving and loading via `save` and `load` methods`.""" + saving and loading via `save` and `load` methods`. 
+ This was possible prior to the 1.0.0 release; `Word2Vec.load` now raises AttributeError""" model = word2vec.Word2Vec(sentences, min_count=1) model.init_sims() testvocab = os.path.join(tempfile.gettempdir(), 'gensim_word2vec.vocab') model.wv.save_word2vec_format(testfile(), testvocab, binary=True) - binary_model_with_vocab = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), testvocab, binary=True) - binary_model_with_vocab.save(testfile()) - binary_model_with_vocab = word2vec.Word2Vec.load(testfile()) - self.assertEqual(model.wv.vocab['human'].count, binary_model_with_vocab.wv.vocab['human'].count) + binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), testvocab, binary=True) + binary_model_with_vocab_kv.save(testfile()) + self.assertRaises(AttributeError, word2vec.Word2Vec.load, testfile()) def testLargeMmap(self):