Merge pull request #1 from SuperLinguini/emlo
Embeddings from Language Models (ELMo)
cgraywang authored May 10, 2018
2 parents 7f6068d + ac1252f commit dc494ae
Showing 5 changed files with 1,123 additions and 2 deletions.
50 changes: 49 additions & 1 deletion gluonnlp/data/language_model.py
@@ -20,13 +20,14 @@
# pylint: disable=
"""Language model datasets."""

__all__ = ['WikiText2', 'WikiText103']
__all__ = ['WikiText2', 'WikiText103', 'WikiText2Character']

import os
import zipfile
import shutil

from mxnet.gluon.utils import download, check_sha1, _get_repo_file_url
import mxnet as mx

from .. import _constants as C
from .dataset import LanguageModelDataset
@@ -99,6 +100,53 @@ def __init__(self, segment='train', skip_empty=True, bos=None, eos=C.EOS_TOKEN,
'c7b8ce0aa086fb34dab808c5c49224211eb2b172')}
super(WikiText2, self).__init__('wikitext-2', segment, bos, eos, skip_empty, root)

@register(segment=['train', 'val', 'test'])
class WikiText2Character(WikiText2):
"""WikiText-2 word-level dataset for language modeling, from Salesforce research.
From
https://einstein.ai/research/the-wikitext-long-term-dependency-language-modeling-dataset
License: Creative Commons Attribution-ShareAlike
Parameters
----------
segment : str, default 'train'
Dataset segment. Options are 'train', 'val', 'test'.
skip_empty : bool, default True
Whether to skip the empty samples produced from sample_splitters. If False, `bos` and `eos`
will be added in empty samples.
bos : str or None, default '<bos>'
The token to add at the beginning of each sentence. If None, nothing is added.
eos : str or None, default '<eos>'
The token to add at the end of each sentence. If None, nothing is added.
root : str, default '~/.mxnet/datasets/wikitext-2'
Path to temp folder for storing data.
"""
def __init__(self, segment='train', skip_empty=True, bos='<bos>', eos='<eos>',
root=os.path.join('~', '.mxnet', 'datasets', 'wikitext-2')):
super(WikiText2Character, self).__init__(segment, skip_empty, bos, eos, root)

def batchify(self, vocab, batch_size, max_word_length=50, load=None):
"""Transform the dataset into N independent sequences, where N is the batch size.
Parameters
----------
vocab : gluonnlp.Vocab
The vocabulary to use for numericalizing the dataset. Each token will be mapped to the
index according to the vocabulary.
batch_size : int
The number of samples in each batch.
Returns
-------
NDArray of shape (num_tokens // N, N). Excessive tokens that don't align along
the batches are discarded.
"""
data = self._data[0]
sample_len = len(data) // batch_size
# Character ids, reshaped to (sample_len, batch_size, max_word_length).
char_ids = vocab.dataset_to_char_ids(data[:sample_len*batch_size], batch_size, sample_len, max_word_length).swapaxes(0, 1)
# Word indices, reshaped to (sample_len, batch_size).
word_ids = mx.nd.array(vocab[data[:sample_len*batch_size]]).reshape(batch_size, -1).T
return char_ids, word_ids


@register(segment=['train', 'val', 'test'])
class WikiText103(_WikiText):
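A minimal usage sketch of the pieces added above (assuming a gluonnlp build that includes this diff and gluonnlp's count_tokens helper; the batch_size and max_word_length values are arbitrary):

import gluonnlp as nlp
from gluonnlp.data import WikiText2Character
from gluonnlp.vocab import UnicodeCharsVocabulary

# Load the WikiText-2 training split with <bos>/<eos> markers added per sentence.
train = WikiText2Character(segment='train')

# Build a character-aware vocabulary from the flattened token stream.
counter = nlp.data.count_tokens(train[0])
vocab = UnicodeCharsVocabulary(counter, max_word_length=50)

# batchify returns (char_ids, word_ids):
#   char_ids: (num_tokens // batch_size, batch_size, max_word_length)
#   word_ids: (num_tokens // batch_size, batch_size)
char_ids, word_ids = train.batchify(vocab, batch_size=20, max_word_length=50)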
156 changes: 155 additions & 1 deletion gluonnlp/vocab.py
@@ -23,12 +23,13 @@
from __future__ import absolute_import
from __future__ import print_function

__all__ = ['Vocab']
__all__ = ['Vocab', 'UnicodeCharsVocabulary']

import json
import warnings

from mxnet import nd
import numpy as np

from .data.utils import DefaultLookupDict
from . import _constants as C
@@ -445,3 +446,156 @@ def from_json(json_str):
vocab._bos_token = vocab_dict.get('bos_token')
vocab._eos_token = vocab_dict.get('eos_token')
return vocab

class UnicodeCharsVocabulary(Vocab):
"""Vocabulary containing character-level and word level information.
Has a word vocabulary that is used to lookup word ids and
a character id that is used to map words to arrays of character ids.
The character ids are defined by ord(c) for c in word.encode('utf-8')
This limits the total number of possible char ids to 256.
To this we add 5 additional special ids: begin sentence, end sentence,
begin word, end word and padding.
"""
def __init__(self, counter=None, max_word_length=50, max_size=None, min_freq=1, unknown_token='<unk>',
padding_token='<pad>', bos_token='<bos>', eos_token='<eos>', reserved_tokens=None):
super(UnicodeCharsVocabulary, self).__init__(counter, max_size, min_freq, unknown_token, padding_token,
bos_token, eos_token, reserved_tokens)
self._max_word_length = max_word_length

# char ids 0-255 come from utf-8 encoding bytes
# assign 256-300 to special chars
self.bos_char = 256 # <begin sentence>
self.eos_char = 257 # <end sentence>
self.bow_char = 258 # <begin word>
self.eow_char = 259 # <end word>
self.pad_char = 260 # <padding>

if counter:
self.num_words = self.__len__()

self._word_char_ids = np.zeros([self.num_words, max_word_length],
dtype=np.int32)

# the character representation of the begin/end of sentence tokens
def _make_bos_eos(c):
r = np.zeros([self.max_word_length], dtype=np.int32)
r[:] = self.pad_char
r[0] = self.bow_char
r[1] = c
r[2] = self.eow_char
return r
self.bos_chars = _make_bos_eos(self.bos_char)
self.eos_chars = _make_bos_eos(self.eos_char)

for i, word in enumerate(self._token_to_idx):
self._word_char_ids[i] = self._convert_word_to_char_ids(word)

self._word_char_ids[self._token_to_idx[self.bos_token]] = self.bos_chars
self._word_char_ids[self._token_to_idx[self.eos_token]] = self.eos_chars

@property
def word_char_ids(self):
return self._word_char_ids

@property
def size(self):
return self.num_words

@property
def max_word_length(self):
return self._max_word_length

def _convert_word_to_char_ids(self, word):
code = np.zeros([self.max_word_length], dtype=np.int32)
code[:] = self.pad_char

word_encoded = word.encode('utf-8', 'ignore')[:(self.max_word_length-2)]
code[0] = self.bow_char
for k, chr_id in enumerate(word_encoded, start=1):
code[k] = chr_id
# Place the end-of-word marker right after the last character byte (also safe for empty words).
code[len(word_encoded) + 1] = self.eow_char

return code

def word_to_char_ids(self, word):
if word in self._token_to_idx:
return self._word_char_ids[self._token_to_idx[word]]
else:
return self._convert_word_to_char_ids(word)

def array_to_char_ids(self, input_array, max_word_length):
char_array = nd.full((input_array.shape[0], input_array.shape[1], max_word_length), self.pad_char)

for i in range(input_array.shape[0]):
for j in range(input_array.shape[1]):
word = input_array[i][j]
if word in self._token_to_idx:
char_array[i][j] = self._word_char_ids[self._token_to_idx[word]]
else:
word_encoded = word.encode('utf-8', 'ignore')[:(self.max_word_length - 2)]
char_array[i][j][0] = self.bow_char
for k, chr_id in enumerate(word_encoded, start=1):
char_array[i][j][k] = chr_id
char_array[i][j][len(word_encoded) + 1] = self.eow_char

char_array += 1
return char_array

def dataset_to_char_ids(self, dataset, batch_size, sample_len, max_word_length):
char_dataset = nd.full((batch_size, sample_len, max_word_length), self.pad_char)

for i, word in enumerate(dataset):
if word in self._token_to_idx:
char_dataset[i // sample_len][i % sample_len] = self._word_char_ids[self._token_to_idx[word]]
else:
word_encoded = word.encode('utf-8', 'ignore')[:(self.max_word_length - 2)]
char_dataset[i // sample_len][i % sample_len][0] = self.bow_char
for k, chr_id in enumerate(word_encoded, start=1):
char_dataset[i // sample_len][i % sample_len][k] = chr_id
char_dataset[i // sample_len][i % sample_len][len(word_encoded) + 1] = self.eow_char

char_dataset += 1

return char_dataset

def encode_chars(self, sentence, reverse=False, split=True):
'''
Encode a sentence into a 2-D array of character ids, one row per token plus <bos>/<eos> rows.
If `split` is True the sentence is treated as a whitespace-delimited string of tokens;
otherwise it is treated as an iterable of tokens.
'''
if split:
chars_ids = [self.word_to_char_ids(cur_word)
for cur_word in sentence.split()]
else:
chars_ids = [self.word_to_char_ids(cur_word)
for cur_word in sentence]
if reverse:
return np.vstack([self.eos_chars] + chars_ids + [self.bos_chars])
else:
return np.vstack([self.bos_chars] + chars_ids + [self.eos_chars])

def __getitem__(self, tokens):
"""Looks up indices of text tokens according to the vocabulary.
If `unknown_token` of the vocabulary is None, looking up unknown tokens results in KeyError.
Parameters
----------
tokens : str or list of strs
A source token or tokens to be converted.
Returns
-------
int, list of ints, or numpy.ndarray
A token index, a list of token indices, or an array of token indices according to the
vocabulary.
"""

if isinstance(tokens, (list, tuple)):
return [self._token_to_idx[token] for token in tokens]
elif isinstance(tokens, np.ndarray):
vfunc = np.vectorize(self._token_to_idx.__getitem__)
return vfunc(tokens)
else:
return self._token_to_idx[tokens]
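As a rough illustration of the character-id scheme described in the class docstring (utf-8 byte values 0-255 plus special ids 256-260 for begin/end of sentence, begin/end of word, and padding); the toy counter and max_word_length below are arbitrary:

import gluonnlp as nlp
from gluonnlp.vocab import UnicodeCharsVocabulary

counter = nlp.data.count_tokens('the quick brown fox'.split())
vocab = UnicodeCharsVocabulary(counter, max_word_length=10)

# Per-word character ids: <bow>, utf-8 bytes, <eow>, then padding.
ids = vocab.word_to_char_ids('fox')
# ids[0] == 258 (bow_char), ids[1:4] == [ord('f'), ord('o'), ord('x')],
# ids[4] == 259 (eow_char), remaining entries == 260 (pad_char)

# encode_chars stacks <bos>/<eos> rows around the per-word rows.
sentence_ids = vocab.encode_chars('the quick brown fox')  # shape (6, 10)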
96 changes: 96 additions & 0 deletions scripts/language_model/ELMo-biLM/LSTMPCellWithClip.py
@@ -0,0 +1,96 @@
from mxnet.gluon.contrib.rnn import LSTMPCell

class LSTMPCellWithClip(LSTMPCell):
r"""Long-Short Term Memory Projected (LSTMP) network cell.
(https://arxiv.org/abs/1402.1128)
Each call computes the following function:
.. math::
\begin{array}{ll}
i_t = sigmoid(W_{ii} x_t + b_{ii} + W_{ri} r_{(t-1)} + b_{ri}) \\
f_t = sigmoid(W_{if} x_t + b_{if} + W_{rf} r_{(t-1)} + b_{rf}) \\
g_t = \tanh(W_{ig} x_t + b_{ig} + W_{rg} r_{(t-1)} + b_{rg}) \\
o_t = sigmoid(W_{io} x_t + b_{io} + W_{ro} r_{(t-1)} + b_{ro}) \\
c_t = f_t * c_{(t-1)} + i_t * g_t \\
h_t = o_t * \tanh(c_t) \\
r_t = W_{hr} h_t
\end{array}
where :math:`r_t` is the projected recurrent activation at time `t`,
:math:`h_t` is the hidden state at time `t`, :math:`c_t` is the
cell state at time `t`, :math:`x_t` is the input at time `t`, and :math:`i_t`,
:math:`f_t`, :math:`g_t`, :math:`o_t` are the input, forget, cell, and
out gates, respectively.
Parameters
----------
hidden_size : int
Number of units in cell state symbol.
projection_size : int
Number of units in output symbol.
i2h_weight_initializer : str or Initializer
Initializer for the input weights matrix, used for the linear
transformation of the inputs.
h2h_weight_initializer : str or Initializer
Initializer for the recurrent weights matrix, used for the linear
transformation of the hidden state.
h2r_weight_initializer : str or Initializer
Initializer for the projection weights matrix, used for the linear
transformation of the recurrent state.
i2h_bias_initializer : str or Initializer, default 'lstmbias'
Initializer for the bias vector. By default, bias for the forget
gate is initialized to 1 while all other biases are initialized
to zero.
h2h_bias_initializer : str or Initializer
Initializer for the bias vector.
cell_clip : float or None, default None
If not None, the cell state is clipped element-wise to [-cell_clip, cell_clip] at each step.
projection_clip : float or None, default None
If not None, the projected recurrent activation is clipped element-wise to
[-projection_clip, projection_clip] at each step.
prefix : str, default 'lstmp_'
Prefix for name of `Block`s
(and name of weight if params is `None`).
params : Parameter or None
Container for weight sharing between cells.
Created if `None`.
Inputs:
- **data**: input tensor with shape `(batch_size, input_size)`.
- **states**: a list of two initial recurrent state tensors, with shape
`(batch_size, projection_size)` and `(batch_size, hidden_size)` respectively.
Outputs:
- **out**: output tensor with shape `(batch_size, projection_size)`.
- **next_states**: a list of two output recurrent state tensors. Each has
the same shape as `states`.
"""
def __init__(self, hidden_size, projection_size,
i2h_weight_initializer=None, h2h_weight_initializer=None,
h2r_weight_initializer=None,
i2h_bias_initializer='zeros', h2h_bias_initializer='zeros',
input_size=0, cell_clip=None, projection_clip=None, prefix=None, params=None):
super(LSTMPCellWithClip, self).__init__(hidden_size, projection_size, i2h_weight_initializer,
h2h_weight_initializer, h2r_weight_initializer, i2h_bias_initializer,
h2h_bias_initializer, input_size, prefix=prefix, params=params)

self._cell_clip = cell_clip
self._projection_clip = projection_clip

# pylint: disable= arguments-differ
def hybrid_forward(self, F, inputs, states, i2h_weight,
h2h_weight, h2r_weight, i2h_bias, h2h_bias):
prefix = 't%d_'%self._counter
i2h = F.FullyConnected(data=inputs, weight=i2h_weight, bias=i2h_bias,
num_hidden=self._hidden_size*4, name=prefix+'i2h')
h2h = F.FullyConnected(data=states[0], weight=h2h_weight, bias=h2h_bias,
num_hidden=self._hidden_size*4, name=prefix+'h2h')
gates = i2h + h2h
slice_gates = F.SliceChannel(gates, num_outputs=4, name=prefix+'slice')
in_gate = F.Activation(slice_gates[0], act_type="sigmoid", name=prefix+'i')
forget_gate = F.Activation(slice_gates[1], act_type="sigmoid", name=prefix+'f')
in_transform = F.Activation(slice_gates[2], act_type="tanh", name=prefix+'c')
out_gate = F.Activation(slice_gates[3], act_type="sigmoid", name=prefix+'o')
next_c = F._internal._plus(forget_gate * states[1], in_gate * in_transform,
name=prefix+'state')
if self._cell_clip is not None:
F.clip(next_c, a_min=-self._cell_clip, a_max=self._cell_clip, out=next_c)
hidden = F._internal._mul(out_gate, F.Activation(next_c, act_type="tanh"),
name=prefix+'hidden')
next_r = F.FullyConnected(data=hidden, num_hidden=self._projection_size,
weight=h2r_weight, no_bias=True, name=prefix+'out')
if self._projection_clip is not None:
F.clip(next_r, a_min=-self._projection_clip, a_max=self._projection_clip, out=next_r)

return next_r, [next_r, next_c]
# pylint: enable= arguments-differ
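
A small usage sketch for a single step of the cell (assuming it is run from this script's directory; the sizes and clip values below are arbitrary):

import mxnet as mx
from LSTMPCellWithClip import LSTMPCellWithClip

cell = LSTMPCellWithClip(hidden_size=32, projection_size=16, cell_clip=3.0, projection_clip=3.0)
cell.initialize()

# One time step: input of shape (batch_size, input_size) plus two states of shape
# (batch_size, projection_size) and (batch_size, hidden_size).
x = mx.nd.random.uniform(shape=(4, 8))
states = cell.begin_state(batch_size=4)
output, next_states = cell(x, states)
print(output.shape)  # (4, 16): the clipped, projected recurrent activation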
