Wrapper for Varembed Models #1067
@@ -0,0 +1,9 @@
:mod:`models.wrappers.varembed` -- VarEmbed Word Embeddings
=============================================================

.. automodule:: gensim.models.wrappers.varembed
    :synopsis: VarEmbed Word Embeddings
    :members:
    :inherited-members:
    :undoc-members:
    :show-inheritance:
@@ -0,0 +1,112 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (C) 2017 Anmol Gulati <[email protected]>
# Copyright (C) 2017 Radim Rehurek <[email protected]>

"""
Python wrapper around word representation learning from VarEmbed models, a library for
efficient learning of word representations that incorporates morphological information [1].

This module makes it possible to obtain word vectors for out-of-vocabulary words from a trained VarEmbed model [2].

The wrapped model can NOT be updated with new documents for online training -- use gensim's `Word2Vec` for that.

.. [1] https://github.com/rguthrie3/MorphologicalPriorsForWordEmbeddings

.. [2] http://arxiv.org/pdf/1608.01056.pdf
"""

import logging
import sys

import numpy as np

from gensim.models.keyedvectors import KeyedVectors

# utility fnc for pickling, common scipy operations etc
from gensim import utils
from gensim.models.word2vec import Vocab

logger = logging.getLogger(__name__)

class VarEmbed(KeyedVectors):
    """
    Class for word vectors using VarEmbed models. Contains methods to load a VarEmbed model and implements
    functionality like `most_similar` and `similarity` by extracting vectors into a numpy matrix.
    Refer to https://github.com/rguthrie3/MorphologicalPriorsForWordEmbeddings for the
    reference implementation of VarEmbed models.
    """

    def __init__(self):
        self.vector_size = 0
        self.vocab_size = 0

    @classmethod
    def load_varembed_format(cls, vectors, morfessor_model=None, use_morphemes=False):
        """
        Load the word vectors into matrix from the varembed output vector files.
        Using morphemes requires Python 2.7 or above.

        'vectors' is the pickle file containing the word vectors.
        'morfessor_model' is the path to the trained morfessor model.
        'use_morphemes' (default False) controls whether morpheme embeddings are added to the output vectors.
        """
        result = cls()
        if vectors is None:
            raise Exception(
                "Please provide vectors binary to load varembed model")
        D = utils.unpickle(vectors)
        word_to_ix = D['word_to_ix']
        morpho_to_ix = D['morpho_to_ix']
        word_embeddings = D['word_embeddings']
        morpho_embeddings = D['morpheme_embeddings']
        result.load_word_embeddings(word_embeddings, word_to_ix)
        if use_morphemes:
            if sys.version_info >= (2, 7):  # Morfessor is only supported for Python 2.7 and above.
                try:
                    import morfessor
                    morfessor_model = morfessor.MorfessorIO().read_binary_model_file(morfessor_model)
                    result.ensemble_morpheme_embeddings(morfessor_model, morpho_embeddings, morpho_to_ix)
                except ImportError:
                    # Morfessor package not found.
                    logger.error('Could not import morfessor. Not using morpheme embeddings')
                    raise ImportError('Could not import morfessor.')
            else:
                # Raise exception in Python 2.6 or earlier.
                raise Exception('Using Morphemes requires Python 2.7 and above. '
                                'Morfessor is not supported in python 2.6')

        logger.info('Loaded varembed model vectors from %s', vectors)
        return result

    def load_word_embeddings(self, word_embeddings, word_to_ix):
        """ Loads the word embeddings. """
        logger.info("Loading the vocabulary")
        self.vocab = {}
        self.index2word = []
        counts = {}
        for word in word_to_ix:
            counts[word] = counts.get(word, 0) + 1
        self.vocab_size = len(counts)
        self.vector_size = word_embeddings.shape[1]
        self.syn0 = np.zeros((self.vocab_size, self.vector_size))
        self.index2word = [None] * self.vocab_size
        logger.info("Corpus has %i words", self.vocab_size)
        for word_id, word in enumerate(counts):
            self.vocab[word] = Vocab(index=word_id, count=counts[word])
            self.syn0[word_id] = word_embeddings[word_to_ix[word]]
            self.index2word[word_id] = word
        assert((len(self.vocab), self.vector_size) == self.syn0.shape)
        logger.info("Loaded matrix of %d size and %d dimensions", self.vocab_size, self.vector_size)

    def ensemble_morpheme_embeddings(self, morfessor_model, morpho_embeddings, morpho_to_ix):
        """ Method to include morpheme embeddings into varembed vectors.
        Allowed only in Python versions 2.7 and above.
        """
        for word in self.vocab:
            morpheme_embedding = np.array(
                [morpho_embeddings[morpho_to_ix.get(m, -1)] for m in
                 morfessor_model.viterbi_segment(word)[0]]).sum(axis=0)
            self.syn0[self.vocab[word].index] += morpheme_embedding
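
For reference, a minimal usage sketch of the wrapper this PR adds (not part of the diff). The paths 'varembed_vectors.pkl' and 'morfessor_model.bin' are hypothetical placeholders for a trained VarEmbed output pickle and an optional trained Morfessor model.

    from gensim.models.wrappers.varembed import VarEmbed

    # Load word embeddings only (no morpheme information).
    # 'varembed_vectors.pkl' is a placeholder path to the VarEmbed output pickle.
    model = VarEmbed.load_varembed_format(vectors='varembed_vectors.pkl')

    # Optionally fold morpheme embeddings into the word vectors; this needs the
    # `morfessor` package, a trained Morfessor model, and Python 2.7 or above.
    # 'morfessor_model.bin' is a placeholder path.
    model = VarEmbed.load_varembed_format(
        vectors='varembed_vectors.pkl',
        morfessor_model='morfessor_model.bin',
        use_morphemes=True)

    # The loaded object behaves like gensim KeyedVectors, so the usual
    # similarity queries are available.
    print(model.similarity('night', 'nights'))
    print(model.most_similar('night', topn=5))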
Review discussion on the online-training note in the module docstring:

Reviewer: You mean that the VarEmbed gensim wrapper doesn't support it? Also, someone might be confused that you are suggesting to load varembed and then train it as Word2Vec on new words, which is incorrect.

Author: Oh yes, you are right. It could be a bit confusing earlier. Updated it now.