-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathload_w2v.py
32 lines (25 loc) · 921 Bytes
/
load_w2v.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
"""
Save the word2vec vectors of all the words in the vocabulary.
If a word is not in word2vec, use the average embedding instead.
"""
import gensim
import numpy as np
import h5py
# Use the gensim package to load the pretrained word2vec model.
# NOTE(review): newer gensim releases appear to expose this loader as
# gensim.models.KeyedVectors.load_word2vec_format and the embedding matrix as
# `.vectors` -- confirm against the installed gensim version before changing.
model = gensim.models.Word2Vec.load_word2vec_format('../../model/GoogleNews-vectors-negative300.bin.gz', binary=True)

# Load the vocabulary; each line of the file is treated as one word once its
# surrounding whitespace is stripped. Two extra entries are appended: the
# end-of-sentence token '</s>' and 'avgvec' (the average-vector entry).
# The `with` block closes the file handle as soon as the lines are read.
with open('data/vocabulary.txt', 'r') as vocab_file:
    vocab = [line.strip() for line in vocab_file] + ['</s>', 'avgvec']

# Mean over axis 0 of the model's embedding matrix; used as the fallback
# embedding for every vocabulary entry that word2vec does not contain.
avg = np.mean(model.syn0, 0)

# One row per vocabulary entry. The row width is taken from the averaged
# vector instead of a hard-coded 300 so it always matches the loaded model.
result = np.zeros((len(vocab), avg.shape[0]))
for i, word in enumerate(vocab):
    if word in model:  # the word is in word2vec
        result[i, :] = model[word]
    else:  # not in word2vec: report the word and fall back to the average
        print(word)
        result[i, :] = avg

# Save the matrix into an h5 file as the float32 dataset "w2v". The context
# manager closes the file even if writing the dataset raises.
filename = 'data/w2v.h5'
with h5py.File(filename, 'w') as f:
    f.create_dataset("w2v", dtype='float32', data=result)