Embeddings from Language Models (ELMo) #1

Merged · 2 commits · May 10, 2018
50 changes: 49 additions & 1 deletion gluonnlp/data/language_model.py
@@ -20,13 +20,14 @@
# pylint: disable=
"""Language model datasets."""

__all__ = ['WikiText2', 'WikiText103']
__all__ = ['WikiText2', 'WikiText103', 'WikiText2Character']

import os
import zipfile
import shutil

from mxnet.gluon.utils import download, check_sha1, _get_repo_file_url
import mxnet as mx

from .. import _constants as C
from .dataset import LanguageModelDataset
@@ -99,6 +100,53 @@ def __init__(self, segment='train', skip_empty=True, bos=None, eos=C.EOS_TOKEN,
'c7b8ce0aa086fb34dab808c5c49224211eb2b172')}
super(WikiText2, self).__init__('wikitext-2', segment, bos, eos, skip_empty, root)

@register(segment=['train', 'val', 'test'])
class WikiText2Character(WikiText2):
"""WikiText-2 word-level dataset for language modeling, from Salesforce research.

From
https://einstein.ai/research/the-wikitext-long-term-dependency-language-modeling-dataset

License: Creative Commons Attribution-ShareAlike

Parameters
----------
segment : str, default 'train'
Dataset segment. Options are 'train', 'val', 'test'.
skip_empty : bool, default True
Whether to skip the empty samples produced from sample_splitters. If False, `bos` and `eos`
will be added in empty samples.
bos : str or None, default '<bos>'
The token to add at the beginning of each sentence. If None, nothing is added.
eos : str or None, default '<eos>'
The token to add at the end of each sentence. If None, nothing is added.
root : str, default '~/.mxnet/datasets/wikitext-2'
Path to temp folder for storing data.
"""
def __init__(self, segment='train', skip_empty=True, bos='<bos>', eos='<eos>',
root=os.path.join('~', '.mxnet', 'datasets', 'wikitext-2')):
super(WikiText2Character, self).__init__(segment, skip_empty, bos, eos, root)

def batchify(self, vocab, batch_size, max_word_length=50, load=None):
"""Transform the dataset into N independent sequences, where N is the batch size.

Parameters
----------
vocab : gluonnlp.vocab.UnicodeCharsVocabulary
The vocabulary to use for numericalizing the dataset. Each token will be mapped to the
index according to the vocabulary, and to a row of character ids.
batch_size : int
The number of samples in each batch.
max_word_length : int, default 50
Maximum number of characters kept per token; longer tokens are truncated.
load : optional
Unused in this implementation.

Returns
-------
Tuple of two NDArrays: character ids of shape (num_tokens // N, N, max_word_length) and
word ids of shape (num_tokens // N, N), where N is the batch size. Character ids are
shifted by +1 so that 0 can serve as padding. Excessive tokens that don't align along
the batches are discarded.
"""
data = self._data[0]
sample_len = len(data) // batch_size
truncated = data[:sample_len * batch_size]
char_ids = vocab.dataset_to_char_ids(truncated, batch_size, sample_len, max_word_length)
return char_ids.swapaxes(0, 1), mx.nd.array(vocab[truncated]).reshape(batch_size, -1).T


@register(segment=['train', 'val', 'test'])
class WikiText103(_WikiText):
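To illustrate how the new dataset is meant to be consumed, here is a minimal usage sketch (not part of the diff). It assumes the PR is installed as part of gluonnlp, that `train_data[0]` is the flattened token stream (the same data `batchify` reads through `self._data[0]`), and that a plain `collections.Counter` is accepted as the `counter` argument of `UnicodeCharsVocabulary`:

```python
from collections import Counter

from gluonnlp.data import WikiText2Character
from gluonnlp.vocab import UnicodeCharsVocabulary

# Load the validation split (small, keeps the sketch quick);
# <bos>/<eos> are added to every sentence by default.
data = WikiText2Character(segment='val')

# Build a character-aware vocabulary from the flattened token stream.
vocab = UnicodeCharsVocabulary(Counter(data[0]), max_word_length=50)

# batchify returns a pair of NDArrays:
#   char_ids of shape (num_tokens // N, N, max_word_length)
#   word_ids of shape (num_tokens // N, N), with N = batch_size
char_ids, word_ids = data.batchify(vocab, batch_size=20, max_word_length=50)
print(char_ids.shape, word_ids.shape)
```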
156 changes: 155 additions & 1 deletion gluonnlp/vocab.py
@@ -23,12 +23,13 @@
from __future__ import absolute_import
from __future__ import print_function

__all__ = ['Vocab']
__all__ = ['Vocab', 'UnicodeCharsVocabulary']

import json
import warnings

from mxnet import nd
import numpy as np

from .data.utils import DefaultLookupDict
from . import _constants as C
@@ -445,3 +446,156 @@ def from_json(json_str):
vocab._bos_token = vocab_dict.get('bos_token')
vocab._eos_token = vocab_dict.get('eos_token')
return vocab

class UnicodeCharsVocabulary(Vocab):
"""Vocabulary containing character-level and word level information.

Has a word vocabulary that is used to look up word ids and
a character-id mapping that is used to map words to arrays of character ids.

The character ids are the byte values of each word's UTF-8 encoding.
This limits the total number of possible char ids to 256.
To this we add 5 additional special ids: begin sentence, end sentence,
begin word, end word and padding.
"""
def __init__(self, counter=None, max_word_length=50, max_size=None, min_freq=1, unknown_token='<unk>',
padding_token='<pad>', bos_token='<bos>', eos_token='<eos>', reserved_tokens=None):
super(UnicodeCharsVocabulary, self).__init__(counter, max_size, min_freq, unknown_token, padding_token,
bos_token, eos_token, reserved_tokens)
self._max_word_length = max_word_length

# char ids 0-255 come from utf-8 encoding bytes
# assign 256-260 to special chars
self.bos_char = 256 # <begin sentence>
self.eos_char = 257 # <end sentence>
self.bow_char = 258 # <begin word>
self.eow_char = 259 # <end word>
self.pad_char = 260 # <padding>

if counter:
self.num_words = self.__len__()

self._word_char_ids = np.zeros([self.num_words, max_word_length],
dtype=np.int32)

# the character representation of the begin/end of sentence tokens
def _make_bos_eos(c):
r = np.zeros([self.max_word_length], dtype=np.int32)
r[:] = self.pad_char
r[0] = self.bow_char
r[1] = c
r[2] = self.eow_char
return r
self.bos_chars = _make_bos_eos(self.bos_char)
self.eos_chars = _make_bos_eos(self.eos_char)

for i, word in enumerate(self._token_to_idx):
self._word_char_ids[i] = self._convert_word_to_char_ids(word)

self._word_char_ids[self._token_to_idx[self.bos_token]] = self.bos_chars
self._word_char_ids[self._token_to_idx[self.eos_token]] = self.eos_chars

@property
def word_char_ids(self):
return self._word_char_ids

@property
def size(self):
return self.num_words

@property
def max_word_length(self):
return self._max_word_length

def _convert_word_to_char_ids(self, word):
code = np.zeros([self.max_word_length], dtype=np.int32)
code[:] = self.pad_char

word_encoded = word.encode('utf-8', 'ignore')[:(self.max_word_length-2)]
code[0] = self.bow_char
k = 0  # guard so the end-of-word marker is still written for empty tokens
for k, chr_id in enumerate(word_encoded, start=1):
code[k] = chr_id
code[k + 1] = self.eow_char

return code

def word_to_char_ids(self, word):
if word in self._token_to_idx:
return self._word_char_ids[self._token_to_idx[word]]
else:
return self._convert_word_to_char_ids(word)

def array_to_char_ids(self, input_array, max_word_length):
"""Convert a 2-D array of tokens into char ids of shape (rows, cols, max_word_length).

All ids are shifted by +1 before returning, so that 0 is free to act as padding.
"""
char_array = nd.full((input_array.shape[0], input_array.shape[1], max_word_length), self.pad_char)

for i in range(input_array.shape[0]):
for j in range(input_array.shape[1]):
word = input_array[i][j]
if word in self._token_to_idx:
char_array[i][j] = self._word_char_ids[self._token_to_idx[word]]
else:
word_encoded = word.encode('utf-8', 'ignore')[:(self.max_word_length - 2)]
char_array[i][j][0] = self.bow_char
k = 0  # guard for empty tokens
for k, chr_id in enumerate(word_encoded, start=1):
char_array[i][j][k] = chr_id
char_array[i][j][k + 1] = self.eow_char

char_array += 1
return char_array

def dataset_to_char_ids(self, dataset, batch_size, sample_len, max_word_length):
"""Convert a flat token sequence into char ids of shape (batch_size, sample_len, max_word_length).

All ids are shifted by +1 before returning, so that 0 is free to act as padding.
"""
char_dataset = nd.full((batch_size, sample_len, max_word_length), self.pad_char)

for i, word in enumerate(dataset):
if word in self._token_to_idx:
char_dataset[i // sample_len][i % sample_len] = self._word_char_ids[self._token_to_idx[word]]
else:
word_encoded = word.encode('utf-8', 'ignore')[:(self.max_word_length - 2)]
char_dataset[i // sample_len][i % sample_len][0] = self.bow_char
k = 0  # guard for empty tokens
for k, chr_id in enumerate(word_encoded, start=1):
char_dataset[i // sample_len][i % sample_len][k] = chr_id
char_dataset[i // sample_len][i % sample_len][k + 1] = self.eow_char

char_dataset += 1

return char_dataset

def encode_chars(self, sentence, reverse=False, split=True):
"""Encode a sentence into a 2-D array of character ids.

`sentence` is a whitespace-delimited string of tokens if `split` is True, otherwise an
iterable of tokens. Rows for <bos> and <eos> are added at the ends (swapped when
`reverse` is True).
"""
if split:
chars_ids = [self.word_to_char_ids(cur_word)
for cur_word in sentence.split()]
else:
chars_ids = [self.word_to_char_ids(cur_word)
for cur_word in sentence]
if reverse:
return np.vstack([self.eos_chars] + chars_ids + [self.bos_chars])
else:
return np.vstack([self.bos_chars] + chars_ids + [self.eos_chars])

def __getitem__(self, tokens):
"""Looks up indices of text tokens according to the vocabulary.

If `unknown_token` of the vocabulary is None, looking up unknown tokens results in KeyError.

Parameters
----------
tokens : str, list of strs, or numpy.ndarray of strs
A source token or tokens to be converted.

Returns
-------
int, list of ints, or numpy.ndarray of ints
A token index or token indices according to the vocabulary.
"""

if isinstance(tokens, (list, tuple)):
return [self._token_to_idx[token] for token in tokens]
elif isinstance(tokens, np.ndarray):
vfunc = np.vectorize(self._token_to_idx.__getitem__)
return vfunc(tokens)
else:
return self._token_to_idx[tokens]
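A small sketch of the character-id scheme defined above (toy vocabulary; the ids follow from the UTF-8 byte values plus the special ids 256-260, and `word_to_char_ids` returns the raw ids without the +1 shift applied by the batch helpers):

```python
from collections import Counter
from gluonnlp.vocab import UnicodeCharsVocabulary

vocab = UnicodeCharsVocabulary(Counter(['the', 'quick', 'brown', 'fox']),
                               max_word_length=10)

# Every known word maps to a fixed-length row:
# [bow=258, UTF-8 bytes of the word, eow=259, pad=260, ...]
print(vocab.word_to_char_ids('the'))
# -> [258 116 104 101 259 260 260 260 260 260]

# A whole sentence gets <bos>/<eos> rows prepended/appended.
chars = vocab.encode_chars('the quick brown fox')
print(chars.shape)  # (6, 10): 4 word rows plus the bos and eos rows
```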
96 changes: 96 additions & 0 deletions scripts/language_model/ELMo-biLM/LSTMPCellWithClip.py
@@ -0,0 +1,96 @@
from mxnet.gluon.contrib.rnn import LSTMPCell

class LSTMPCellWithClip(LSTMPCell):
r"""Long-Short Term Memory Projected (LSTMP) network cell.
(https://arxiv.org/abs/1402.1128)
Each call computes the following function:
.. math::
\begin{array}{ll}
i_t = sigmoid(W_{ii} x_t + b_{ii} + W_{ri} r_{(t-1)} + b_{ri}) \\
f_t = sigmoid(W_{if} x_t + b_{if} + W_{rf} r_{(t-1)} + b_{rf}) \\
g_t = \tanh(W_{ig} x_t + b_{ig} + W_{rg} r_{(t-1)} + b_{rg}) \\
o_t = sigmoid(W_{io} x_t + b_{io} + W_{ro} r_{(t-1)} + b_{ro}) \\
c_t = f_t * c_{(t-1)} + i_t * g_t \\
h_t = o_t * \tanh(c_t) \\
r_t = W_{hr} h_t
\end{array}
where :math:`r_t` is the projected recurrent activation at time `t`,
:math:`h_t` is the hidden state at time `t`, :math:`c_t` is the
cell state at time `t`, :math:`x_t` is the input at time `t`, and :math:`i_t`,
:math:`f_t`, :math:`g_t`, :math:`o_t` are the input, forget, cell, and
out gates, respectively.
Parameters
----------
hidden_size : int
Number of units in cell state symbol.
projection_size : int
Number of units in output symbol.
i2h_weight_initializer : str or Initializer
Initializer for the input weights matrix, used for the linear
transformation of the inputs.
h2h_weight_initializer : str or Initializer
Initializer for the recurrent weights matrix, used for the linear
transformation of the hidden state.
h2r_weight_initializer : str or Initializer
Initializer for the projection weights matrix, used for the linear
transformation of the recurrent state.
i2h_bias_initializer : str or Initializer, default 'lstmbias'
Initializer for the bias vector. By default, bias for the forget
gate is initialized to 1 while all other biases are initialized
to zero.
h2h_bias_initializer : str or Initializer
Initializer for the bias vector.
cell_clip : float or None, default None
If not None, the cell state :math:`c_t` is clipped to [-cell_clip, cell_clip] at every step.
projection_clip : float or None, default None
If not None, the projected output :math:`r_t` is clipped to [-projection_clip, projection_clip] at every step.
prefix : str, default 'lstmp_'
Prefix for name of `Block`s
(and name of weight if params is `None`).
params : Parameter or None
Container for weight sharing between cells.
Created if `None`.
Inputs:
- **data**: input tensor with shape `(batch_size, input_size)`.
- **states**: a list of two initial recurrent state tensors, with shape
`(batch_size, projection_size)` and `(batch_size, hidden_size)` respectively.
Outputs:
- **out**: output tensor with shape `(batch_size, num_hidden)`.
- **next_states**: a list of two output recurrent state tensors. Each has
the same shape as `states`.
"""
def __init__(self, hidden_size, projection_size,
i2h_weight_initializer=None, h2h_weight_initializer=None,
h2r_weight_initializer=None,
i2h_bias_initializer='zeros', h2h_bias_initializer='zeros',
input_size=0, cell_clip=None, projection_clip=None, prefix=None, params=None):
super(LSTMPCellWithClip, self).__init__(hidden_size, projection_size, i2h_weight_initializer,
h2h_weight_initializer, h2r_weight_initializer, i2h_bias_initializer,
h2h_bias_initializer, input_size, prefix=prefix, params=params)

self._cell_clip = cell_clip
self._projection_clip = projection_clip

# pylint: disable= arguments-differ
def hybrid_forward(self, F, inputs, states, i2h_weight,
h2h_weight, h2r_weight, i2h_bias, h2h_bias):
prefix = 't%d_'%self._counter
i2h = F.FullyConnected(data=inputs, weight=i2h_weight, bias=i2h_bias,
num_hidden=self._hidden_size*4, name=prefix+'i2h')
h2h = F.FullyConnected(data=states[0], weight=h2h_weight, bias=h2h_bias,
num_hidden=self._hidden_size*4, name=prefix+'h2h')
gates = i2h + h2h
slice_gates = F.SliceChannel(gates, num_outputs=4, name=prefix+'slice')
in_gate = F.Activation(slice_gates[0], act_type="sigmoid", name=prefix+'i')
forget_gate = F.Activation(slice_gates[1], act_type="sigmoid", name=prefix+'f')
in_transform = F.Activation(slice_gates[2], act_type="tanh", name=prefix+'c')
out_gate = F.Activation(slice_gates[3], act_type="sigmoid", name=prefix+'o')
next_c = F._internal._plus(forget_gate * states[1], in_gate * in_transform,
name=prefix+'state')
if self._cell_clip is not None:
next_c = F.clip(next_c, a_min=-self._cell_clip, a_max=self._cell_clip)
hidden = F._internal._mul(out_gate, F.Activation(next_c, act_type="tanh"),
name=prefix+'hidden')
next_r = F.FullyConnected(data=hidden, num_hidden=self._projection_size,
weight=h2r_weight, no_bias=True, name=prefix+'out')
if self._projection_clip is not None:
next_r = F.clip(next_r, a_min=-self._projection_clip, a_max=self._projection_clip)

return next_r, [next_r, next_c]
# pylint: enable= arguments-differ
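A hedged sketch of exercising the cell on dummy data (sizes are illustrative, not the ELMo configuration; assumes it is run from the ELMo-biLM script directory so the module imports directly):

```python
import mxnet as mx
from LSTMPCellWithClip import LSTMPCellWithClip

# Projected LSTM cell whose cell state and projected output are clipped to [-3, 3].
cell = LSTMPCellWithClip(hidden_size=64, projection_size=32,
                         cell_clip=3, projection_clip=3)
cell.initialize()

batch_size = 2
inputs = mx.nd.random.uniform(shape=(batch_size, 16))
# begin_state returns [r_0 of shape (N, projection_size), c_0 of shape (N, hidden_size)]
states = cell.begin_state(batch_size=batch_size)

output, next_states = cell(inputs, states)
print(output.shape)          # (2, 32): projected activation r_t
print(next_states[1].shape)  # (2, 64): cell state c_t
```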