Remove direct access to properties moved to KeyedVectors #1147

Merged · 5 commits · Feb 16, 2017
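This PR removes the deprecated pass-through access on `Word2Vec`: the `syn0`, `syn0norm`, `vocab`, and `index2word` properties and the `save_word2vec_format`/`load_word2vec_format` wrappers are deleted, so everything vector-related now goes through `model.wv` (a `KeyedVectors` instance). A minimal before/after sketch of the migration implied by the diff, with `sentences` and the file name as placeholders:

```python
from gensim.models import word2vec, keyedvectors

model = word2vec.Word2Vec(sentences, min_count=1)

# Old (removed by this PR):
#   vectors = model.syn0
#   model.save_word2vec_format('vectors.bin', binary=True)
#   model = word2vec.Word2Vec.load_word2vec_format('vectors.bin', binary=True)

# New: go through the KeyedVectors instance on model.wv
vectors = model.wv.syn0
model.wv.save_word2vec_format('vectors.bin', binary=True)
kv = keyedvectors.KeyedVectors.load_word2vec_format('vectors.bin', binary=True)
```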
104 changes: 0 additions & 104 deletions gensim/models/word2vec.py
@@ -330,9 +330,6 @@ class Word2Vec(utils.SaveLoad):

    """

-    # TODO: delete this flag after direct access to syn0norm, syn0, vocab is removed
-    keyed_vector_warnings = True
-
    def __init__(
            self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
            max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
@@ -1092,21 +1089,6 @@ def seeded_vector(self, seed_string):
        once = random.RandomState(self.hashfxn(seed_string) & 0xffffffff)
        return (once.rand(self.vector_size) - 0.5) / self.vector_size

-    def save_word2vec_format(self, *args, **kwargs):
-        if Word2Vec.keyed_vector_warnings:
-            logger.warning('word2vec.save_word2vec_format will be deprecated in future gensim releases. Please use model.wv.save_word2vec_format')
-        return self.wv.save_word2vec_format(*args, **kwargs)
-
-    @classmethod
-    def load_word2vec_format(cls, *args, **kwargs):
-        if Word2Vec.keyed_vector_warnings:
-            logger.warning('Word2Vec.load_word2vec_format will be deprecated in future gensim releases. Please use KeyedVectors.load_word2vec_format')
-
-        wv = KeyedVectors.load_word2vec_format(*args, **kwargs)
-        result = cls(size=wv.syn0.shape[1])
-        result.wv = wv
-        return result
-
    def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='utf8', unicode_errors='strict'):
        """
        Merge the input-hidden weight matrix from the original C word2vec-tool format
@@ -1177,86 +1159,6 @@ def doesnt_match(self, words):
    def __getitem__(self, words):
        return self.wv.__getitem__(words)

-    @staticmethod
-    def disable_keyed_vectors_warnings():
-        Word2Vec.keyed_vector_warnings = False
-
-    @staticmethod
-    def enable_keyed_vectors_warnings():
-        Word2Vec.keyed_vector_warnings = True
-
-    @property
-    def syn0norm(self):
-        if Word2Vec.keyed_vector_warnings:
-            logger.warning('direct access to syn0norm will not be supported in future gensim releases, please use model.wv.syn0norm')
-        return self.wv.syn0norm
-
-    @syn0norm.setter
-    def syn0norm(self, value):
-        if Word2Vec.keyed_vector_warnings:
-            logger.warning('direct access to syn0norm will not be supported in future gensim releases, please use model.wv.syn0norm')
-        self.wv.syn0norm = value
-
-    @syn0norm.deleter
-    def syn0norm(self):
-        if Word2Vec.keyed_vector_warnings:
-            logger.warning('direct access to syn0norm will not be supported in future gensim releases, please use model.wv.syn0norm')
-        del self.wv.syn0norm
-
-    @property
-    def syn0(self):
-        if Word2Vec.keyed_vector_warnings:
-            logger.warning('direct access to syn0 will not be supported in future gensim releases, please use model.wv.syn0')
-        return self.wv.syn0
-
-    @syn0.setter
-    def syn0(self, value):
-        if Word2Vec.keyed_vector_warnings:
-            logger.warning('direct access to syn0 will not be supported in future gensim releases, please use model.wv.syn0')
-        self.wv.syn0 = value
-
-    @syn0.deleter
-    def syn0(self):
-        if Word2Vec.keyed_vector_warnings:
-            logger.warning('direct access to syn0 will not be supported in future gensim releases, please use model.wv.syn0')
-        del self.wv.syn0
-
-    @property
-    def vocab(self):
-        if Word2Vec.keyed_vector_warnings:
-            logger.warning('direct access to vocab will not be supported in future gensim releases, please use model.wv.vocab')
-        return self.wv.vocab
-
-    @vocab.setter
-    def vocab(self, value):
-        if Word2Vec.keyed_vector_warnings:
-            logger.warning('direct access to vocab will not be supported in future gensim releases, please use model.wv.vocab')
-        self.wv.vocab = value
-
-    @vocab.deleter
-    def vocab(self):
-        if Word2Vec.keyed_vector_warnings:
-            logger.warning('direct access to vocab will not be supported in future gensim releases, please use model.wv.vocab')
-        del self.wv.vocab
-
-    @property
-    def index2word(self):
-        if Word2Vec.keyed_vector_warnings:
-            logger.warning('direct access to index2word will not be supported in future gensim releases, please use model.wv.index2word')
-        return self.wv.index2word
-
-    @index2word.setter
-    def index2word(self, value):
-        if Word2Vec.keyed_vector_warnings:
-            logger.warning('direct access to index2word will not be supported in future gensim releases, please use model.wv.index2word')
-        self.wv.index2word = value
-
-    @index2word.deleter
-    def index2word(self):
-        if Word2Vec.keyed_vector_warnings:
-            logger.warning('direct access to index2word will not be supported in future gensim releases, please use model.wv.index2word')
-        del self.wv.index2word
-
    def __contains__(self, word):
        return self.wv.__contains__(word)

@@ -1329,19 +1231,14 @@ def delete_temporary_training_data(self, replace_word_vectors_with_normalized=False):

    def save(self, *args, **kwargs):
        # don't bother storing the cached normalized vectors, recalculable table
-        # TODO: after introducing KeyedVectors now syn0, vocab, id2word are saved TWO times. Once in word2vec and once in keyedvectors
-        # After keyedvectors are deprecated it will be only once
-        Word2Vec.disable_keyed_vectors_warnings()
        kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'table', 'cum_table'])

        super(Word2Vec, self).save(*args, **kwargs)
-        Word2Vec.enable_keyed_vectors_warnings()

    save.__doc__ = utils.SaveLoad.save.__doc__

    @classmethod
    def load(cls, *args, **kwargs):
-        Word2Vec.disable_keyed_vectors_warnings()
        model = super(Word2Vec, cls).load(*args, **kwargs)
        # update older models
        if hasattr(model, 'table'):
@@ -1363,7 +1260,6 @@ def load(cls, *args, **kwargs):
        if not hasattr(model, 'train_count'):
            model.train_count = 0
            model.total_train_time = 0
-        Word2Vec.enable_keyed_vectors_warnings()
        return model

    def _load_specials(self, *args, **kwargs):
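Note that `save`/`load` lose their warning toggling but keep ignoring the recomputable caches. A sketch of the resulting round trip, assuming the post-PR API (paths and `sentences` are placeholders):

```python
from gensim.models import word2vec

model = word2vec.Word2Vec(sentences, min_count=1)  # `sentences`: any iterable of token lists
model.save('/tmp/w2v.model')    # 'syn0norm', 'table', 'cum_table' are ignored by default
model = word2vec.Word2Vec.load('/tmp/w2v.model')
model.init_sims()               # recompute the normalized vectors that save() skipped
```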
2 changes: 1 addition & 1 deletion gensim/similarities/index.py
@@ -59,7 +59,7 @@ def build_from_word2vec(self):
        """Build an Annoy index using word vectors from a Word2Vec model"""

        self.model.init_sims()
-        return self._build_from_model(self.model.wv.syn0norm, self.model.index2word
+        return self._build_from_model(self.model.wv.syn0norm, self.model.wv.index2word
            , self.model.vector_size)

    def build_from_doc2vec(self):
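The fix above makes the Annoy helper read `index2word` from `model.wv` rather than relying on the removed `Word2Vec.index2word` property. For context, a hedged sketch of how this code path is typically reached (treat the exact `AnnoyIndexer` constructor signature as an assumption; `sentences` is a placeholder corpus):

```python
from gensim.models import word2vec
from gensim.similarities.index import AnnoyIndexer

model = word2vec.Word2Vec(sentences, min_count=1)
indexer = AnnoyIndexer(model, 2)  # assumed to dispatch to build_from_word2vec() for Word2Vec models
model.most_similar('human', topn=5, indexer=indexer)  # approximate nearest neighbors via Annoy
```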
4 changes: 2 additions & 2 deletions gensim/test/test_doc2vec.py
@@ -283,7 +283,7 @@ def test_mixed_tag_types(self):

    def models_equal(self, model, model2):
        # check words/hidden-weights
-        self.assertEqual(len(model.vocab), len(model2.vocab))
+        self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab))
        self.assertTrue(np.allclose(model.wv.syn0, model2.wv.syn0))
        if model.hs:
            self.assertTrue(np.allclose(model.syn1, model2.syn1))
@@ -306,7 +306,7 @@ def test_delete_temporary_training_data(self):
        self.assertTrue(hasattr(model, 'syn0_lockf'))
        model.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=False)
        self.assertTrue(len(model['human']), 10)
-        self.assertTrue(model.vocab['graph'].count, 5)
+        self.assertTrue(model.wv.vocab['graph'].count, 5)
        self.assertTrue(not hasattr(model, 'syn1'))
        self.assertTrue(not hasattr(model, 'syn1neg'))
        self.assertTrue(not hasattr(model, 'syn0_lockf'))
6 changes: 3 additions & 3 deletions gensim/test/test_similarities.py
@@ -485,16 +485,16 @@ def testLoadMissingRaisesError(self):
        self.assertRaises(IOError, test_index.load, fname='test-index')

    def assertVectorIsSimilarToItself(self, model, index):
-        vector = model.syn0norm[0]
-        label = model.index2word[0]
+        vector = model.wv.syn0norm[0]
+        label = model.wv.index2word[0]
        approx_neighbors = index.most_similar(vector, 1)
        word, similarity = approx_neighbors[0]

        self.assertEqual(word, label)
        self.assertEqual(similarity, 1.0)

    def assertApproxNeighborsMatchExact(self, model, index):
-        vector = model.syn0norm[0]
+        vector = model.wv.syn0norm[0]
        approx_neighbors = model.most_similar([vector], topn=5, indexer=index)
        exact_neighbors = model.most_similar(positive=[vector], topn=5)
62 changes: 32 additions & 30 deletions gensim/test/test_word2vec.py
@@ -210,63 +210,65 @@ def testLoadPreKeyedVectorModel(self):

    def testLoadPreKeyedVectorModelCFormat(self):
        """Test loading pre-KeyedVectors word2vec model saved in word2vec format"""
-        model = word2vec.Word2Vec.load_word2vec_format(datapath('word2vec_pre_kv_c'))
-        self.assertTrue(model.wv.syn0.shape[0] == len(model.wv.vocab))
+        model = keyedvectors.KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'))
+        self.assertTrue(model.syn0.shape[0] == len(model.vocab))

    def testPersistenceWord2VecFormat(self):
        """Test storing/loading the entire model in word2vec format."""
        model = word2vec.Word2Vec(sentences, min_count=1)
        model.init_sims()
-        model.save_word2vec_format(testfile(), binary=True)
-        binary_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True)
-        binary_model.init_sims(replace=False)
-        self.assertTrue(np.allclose(model['human'], binary_model['human']))
-        norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True)
+        model.wv.save_word2vec_format(testfile(), binary=True)
+        binary_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True)
+        binary_model_kv.init_sims(replace=False)
+        self.assertTrue(np.allclose(model['human'], binary_model_kv['human']))
+        norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True)
        norm_only_model.init_sims(replace=True)
        self.assertFalse(np.allclose(model['human'], norm_only_model['human']))
        self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human']))
-        limited_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True, limit=3)
-        self.assertEquals(len(limited_model.wv.syn0), 3)
-        half_precision_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True, datatype=np.float16)
-        self.assertEquals(binary_model.wv.syn0.nbytes, half_precision_model.wv.syn0.nbytes * 2)
+        limited_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, limit=3)
+        self.assertEquals(len(limited_model_kv.syn0), 3)
+        half_precision_model_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True, datatype=np.float16)
+        self.assertEquals(binary_model_kv.syn0.nbytes, half_precision_model_kv.syn0.nbytes * 2)

    def testNoTrainingCFormat(self):
        model = word2vec.Word2Vec(sentences, min_count=1)
        model.init_sims()
-        model.save_word2vec_format(testfile(), binary=True)
-        binary_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True)
+        model.wv.save_word2vec_format(testfile(), binary=True)
+        kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=True)
+        binary_model = word2vec.Word2Vec()
+        binary_model.wv = kv
        self.assertRaises(ValueError, binary_model.train, sentences)


    def testTooShortBinaryWord2VecFormat(self):
        tfile = testfile()
        model = word2vec.Word2Vec(sentences, min_count=1)
        model.init_sims()
-        model.save_word2vec_format(tfile, binary=True)
+        model.wv.save_word2vec_format(tfile, binary=True)
        f = open(tfile, 'r+b')
        f.write(b'13')  # write wrong (too-long) vector count
        f.close()
-        self.assertRaises(EOFError, word2vec.Word2Vec.load_word2vec_format, tfile, binary=True)
+        self.assertRaises(EOFError, keyedvectors.KeyedVectors.load_word2vec_format, tfile, binary=True)

    def testTooShortTextWord2VecFormat(self):
        tfile = testfile()
        model = word2vec.Word2Vec(sentences, min_count=1)
        model.init_sims()
-        model.save_word2vec_format(tfile, binary=False)
+        model.wv.save_word2vec_format(tfile, binary=False)
        f = open(tfile, 'r+b')
        f.write(b'13')  # write wrong (too-long) vector count
        f.close()
-        self.assertRaises(EOFError, word2vec.Word2Vec.load_word2vec_format, tfile, binary=False)
+        self.assertRaises(EOFError, keyedvectors.KeyedVectors.load_word2vec_format, tfile, binary=False)

    def testPersistenceWord2VecFormatNonBinary(self):
        """Test storing/loading the entire model in word2vec non-binary format."""
        model = word2vec.Word2Vec(sentences, min_count=1)
        model.init_sims()
-        model.save_word2vec_format(testfile(), binary=False)
-        text_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=False)
+        model.wv.save_word2vec_format(testfile(), binary=False)
+        text_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=False)
        text_model.init_sims(False)
        self.assertTrue(np.allclose(model['human'], text_model['human'], atol=1e-6))
-        norm_only_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=False)
+        norm_only_model = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), binary=False)
        norm_only_model.init_sims(True)
        self.assertFalse(np.allclose(model['human'], norm_only_model['human'], atol=1e-6))
        self.assertTrue(np.allclose(model.wv.syn0norm[model.wv.vocab['human'].index], norm_only_model['human'], atol=1e-4))
@@ -276,9 +278,9 @@ def testPersistenceWord2VecFormatWithVocab(self):
        model = word2vec.Word2Vec(sentences, min_count=1)
        model.init_sims()
        testvocab = os.path.join(tempfile.gettempdir(), 'gensim_word2vec.vocab')
-        model.save_word2vec_format(testfile(), testvocab, binary=True)
-        binary_model_with_vocab = word2vec.Word2Vec.load_word2vec_format(testfile(), testvocab, binary=True)
-        self.assertEqual(model.wv.vocab['human'].count, binary_model_with_vocab.wv.vocab['human'].count)
+        model.wv.save_word2vec_format(testfile(), testvocab, binary=True)
+        binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), testvocab, binary=True)
+        self.assertEqual(model.wv.vocab['human'].count, binary_model_with_vocab_kv.vocab['human'].count)

    def testPersistenceKeyedVectorsFormatWithVocab(self):
        """Test storing/loading the entire model and vocabulary in word2vec format."""
@@ -292,15 +294,15 @@ def testPersistenceKeyedVectorsFormatWithVocab(self):

    def testPersistenceWord2VecFormatCombinationWithStandardPersistence(self):
        """Test storing/loading the entire model and vocabulary in word2vec format chained with
-        saving and loading via `save` and `load` methods`."""
+        saving and loading via `save` and `load` methods`.
+        It was possible prior to 1.0.0 release, now raises Exception"""
        model = word2vec.Word2Vec(sentences, min_count=1)
        model.init_sims()
        testvocab = os.path.join(tempfile.gettempdir(), 'gensim_word2vec.vocab')
-        model.save_word2vec_format(testfile(), testvocab, binary=True)
-        binary_model_with_vocab = word2vec.Word2Vec.load_word2vec_format(testfile(), testvocab, binary=True)
-        binary_model_with_vocab.save(testfile())
-        binary_model_with_vocab = word2vec.Word2Vec.load(testfile())
-        self.assertEqual(model.wv.vocab['human'].count, binary_model_with_vocab.wv.vocab['human'].count)
+        model.wv.save_word2vec_format(testfile(), testvocab, binary=True)
+        binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(testfile(), testvocab, binary=True)
+        binary_model_with_vocab_kv.save(testfile())
+        self.assertRaises(AttributeError, word2vec.Word2Vec.load, testfile())


    def testLargeMmap(self):
@@ -416,7 +418,7 @@ def model_sanity(self, model, train=True):
        orig0 = np.copy(model.wv.syn0[0])
        model.train(list_corpus)
        self.assertFalse((orig0 == model.wv.syn0[1]).all())  # vector should vary after training
-        sims = model.most_similar('war', topn=len(model.index2word))
+        sims = model.most_similar('war', topn=len(model.wv.index2word))
        t_rank = [word for word, score in sims].index('terrorism')
        # in >200 calibration runs w/ calling parameters, 'terrorism' in 50-most_sim for 'war'
        self.assertLess(t_rank, 50)
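The rewritten combination test pins down the headline breaking change: a saved `KeyedVectors` object can no longer be loaded back as a full `Word2Vec` model. A sketch of the now-failing pattern (file names are placeholders):

```python
from gensim.models import word2vec, keyedvectors

kv = keyedvectors.KeyedVectors.load_word2vec_format('vectors.bin', binary=True)
kv.save('vectors.gensim')                         # persists bare vectors, no training state
keyedvectors.KeyedVectors.load('vectors.gensim')  # still works
word2vec.Word2Vec.load('vectors.gensim')          # now raises AttributeError, per the test
```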