Merge pull request #1 from SuperLinguini/emlo
Embeddings from Language Models (ELMo)
cgraywang authored May 10, 2018
2 parents 7f6068d + ac1252f commit dc494ae
Showing 5 changed files with 1,123 additions and 2 deletions.
50 changes: 49 additions & 1 deletion gluonnlp/data/language_model.py
@@ -20,13 +20,14 @@
# pylint: disable=
"""Language model datasets."""

__all__ = ['WikiText2', 'WikiText103']
__all__ = ['WikiText2', 'WikiText103', 'WikiText2Character']

import os
import zipfile
import shutil

from mxnet.gluon.utils import download, check_sha1, _get_repo_file_url
import mxnet as mx

from .. import _constants as C
from .dataset import LanguageModelDataset
@@ -99,6 +100,53 @@ def __init__(self, segment='train', skip_empty=True, bos=None, eos=C.EOS_TOKEN,
'c7b8ce0aa086fb34dab808c5c49224211eb2b172')}
super(WikiText2, self).__init__('wikitext-2', segment, bos, eos, skip_empty, root)

@register(segment=['train', 'val', 'test'])
class WikiText2Character(WikiText2):
"""WikiText-2 word-level dataset for language modeling, from Salesforce research.
From
https://einstein.ai/research/the-wikitext-long-term-dependency-language-modeling-dataset
License: Creative Commons Attribution-ShareAlike
Parameters
----------
segment : str, default 'train'
Dataset segment. Options are 'train', 'val', 'test'.
skip_empty : bool, default True
Whether to skip the empty samples produced from sample_splitters. If False, `bos` and `eos`
will be added in empty samples.
bos : str or None, default '<bos>'
The token to add at the beginning of each sentence. If None, nothing is added.
eos : str or None, default '<eos>'
The token to add at the end of each sentence. If None, nothing is added.
root : str, default '~/.mxnet/datasets/wikitext-2'
Path to temp folder for storing data.
"""
def __init__(self, segment='train', skip_empty=True, bos='<bos>', eos='<eos>',
root=os.path.join('~', '.mxnet', 'datasets', 'wikitext-2')):
super(WikiText2Character, self).__init__(segment, skip_empty, bos, eos, root)

def batchify(self, vocab, batch_size, max_word_length=50, load=None):
"""Transform the dataset into N independent sequences, where N is the batch size.
Parameters
----------
vocab : gluonnlp.Vocab
The vocabulary to use for numericalizing the dataset. Each token will be mapped to the
index according to the vocabulary.
batch_size : int
The number of samples in each batch.
Returns
-------
NDArray of shape (num_tokens // N, N). Excessive tokens that don't align along
the batches are discarded.
"""
data = self._data[0]
sample_len = len(data) // batch_size
# Character ids, reshaped to (sample_len, batch_size, max_word_length).
char_ids = vocab.dataset_to_char_ids(data[:sample_len*batch_size], batch_size, sample_len, max_word_length).swapaxes(0, 1)
# Word indices, reshaped to (sample_len, batch_size).
word_ids = mx.nd.array(vocab[data[:sample_len*batch_size]]).reshape(batch_size, -1).T
return char_ids, word_ids


@register(segment=['train', 'val', 'test'])
class WikiText103(_WikiText):
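A minimal usage sketch of the pieces added above (assuming a gluonnlp build that includes this diff and gluonnlp's count_tokens helper; the batch_size and max_word_length values are arbitrary):

import gluonnlp as nlp
from gluonnlp.data import WikiText2Character
from gluonnlp.vocab import UnicodeCharsVocabulary

# Load the WikiText-2 training split with <bos>/<eos> markers added per sentence.
train = WikiText2Character(segment='train')

# Build a character-aware vocabulary from the flattened token stream.
counter = nlp.data.count_tokens(train[0])
vocab = UnicodeCharsVocabulary(counter, max_word_length=50)

# batchify returns (char_ids, word_ids):
#   char_ids: (num_tokens // batch_size, batch_size, max_word_length)
#   word_ids: (num_tokens // batch_size, batch_size)
char_ids, word_ids = train.batchify(vocab, batch_size=20, max_word_length=50)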
156 changes: 155 additions & 1 deletion gluonnlp/vocab.py
@@ -23,12 +23,13 @@
from __future__ import absolute_import
from __future__ import print_function

__all__ = ['Vocab']
__all__ = ['Vocab', 'UnicodeCharsVocabulary']

import json
import warnings

from mxnet import nd
import numpy as np

from .data.utils import DefaultLookupDict
from . import _constants as C
@@ -445,3 +446,156 @@ def from_json(json_str):
vocab._bos_token = vocab_dict.get('bos_token')
vocab._eos_token = vocab_dict.get('eos_token')
return vocab

class UnicodeCharsVocabulary(Vocab):
"""Vocabulary containing character-level and word level information.
Has a word vocabulary that is used to lookup word ids and
a character id that is used to map words to arrays of character ids.
The character ids are defined by ord(c) for c in word.encode('utf-8')
This limits the total number of possible char ids to 256.
To this we add 5 additional special ids: begin sentence, end sentence,
begin word, end word and padding.
"""
def __init__(self, counter=None, max_word_length=50, max_size=None, min_freq=1, unknown_token='<unk>',
padding_token='<pad>', bos_token='<bos>', eos_token='<eos>', reserved_tokens=None):
super(UnicodeCharsVocabulary, self).__init__(counter, max_size, min_freq, unknown_token, padding_token,
bos_token, eos_token, reserved_tokens)
self._max_word_length = max_word_length

# char ids 0-255 come from utf-8 encoding bytes
# assign 256-300 to special chars
self.bos_char = 256 # <begin sentence>
self.eos_char = 257 # <end sentence>
self.bow_char = 258 # <begin word>
self.eow_char = 259 # <end word>
self.pad_char = 260 # <padding>

if counter:
self.num_words = self.__len__()

self._word_char_ids = np.zeros([self.num_words, max_word_length],
dtype=np.int32)

# the character representation of the begin/end of sentence tokens
def _make_bos_eos(c):
r = np.zeros([self.max_word_length], dtype=np.int32)
r[:] = self.pad_char
r[0] = self.bow_char
r[1] = c
r[2] = self.eow_char
return r
self.bos_chars = _make_bos_eos(self.bos_char)
self.eos_chars = _make_bos_eos(self.eos_char)

for i, word in enumerate(self._token_to_idx):
self._word_char_ids[i] = self._convert_word_to_char_ids(word)

self._word_char_ids[self._token_to_idx[self.bos_token]] = self.bos_chars
self._word_char_ids[self._token_to_idx[self.eos_token]] = self.eos_chars

@property
def word_char_ids(self):
return self._word_char_ids

@property
def size(self):
return self.num_words

@property
def max_word_length(self):
return self._max_word_length

def _convert_word_to_char_ids(self, word):
code = np.zeros([self.max_word_length], dtype=np.int32)
code[:] = self.pad_char

word_encoded = word.encode('utf-8', 'ignore')[:(self.max_word_length-2)]
code[0] = self.bow_char
for k, chr_id in enumerate(word_encoded, start=1):
code[k] = chr_id
# Place the end-of-word marker right after the last character byte (also safe for empty words).
code[len(word_encoded) + 1] = self.eow_char

return code

def word_to_char_ids(self, word):
if word in self._token_to_idx:
return self._word_char_ids[self._token_to_idx[word]]
else:
return self._convert_word_to_char_ids(word)

def array_to_char_ids(self, input_array, max_word_length):
char_array = nd.full((input_array.shape[0], input_array.shape[1], max_word_length), self.pad_char)

for i in range(input_array.shape[0]):
for j in range(input_array.shape[1]):
word = input_array[i][j]
if word in self._token_to_idx:
char_array[i][j] = self._word_char_ids[self._token_to_idx[word]]
else:
word_encoded = word.encode('utf-8', 'ignore')[:(self.max_word_length - 2)]
char_array[i][j][0] = self.bow_char
for k, chr_id in enumerate(word_encoded, start=1):
char_array[i][j][k] = chr_id
char_array[i][j][len(word_encoded) + 1] = self.eow_char

char_array += 1
return char_array

def dataset_to_char_ids(self, dataset, batch_size, sample_len, max_word_length):
char_dataset = nd.full((batch_size, sample_len, max_word_length), self.pad_char)

for i, word in enumerate(dataset):
if word in self._token_to_idx:
char_dataset[i // sample_len][i % sample_len] = self._word_char_ids[self._token_to_idx[word]]
else:
word_encoded = word.encode('utf-8', 'ignore')[:(self.max_word_length - 2)]
char_dataset[i // sample_len][i % sample_len][0] = self.bow_char
for k, chr_id in enumerate(word_encoded, start=1):
char_dataset[i // sample_len][i % sample_len][k] = chr_id
char_dataset[i // sample_len][i % sample_len][len(word_encoded) + 1] = self.eow_char

char_dataset += 1

return char_dataset

def encode_chars(self, sentence, reverse=False, split=True):
'''
Encode a sentence into a 2-D array of character ids, one row per token plus <bos>/<eos> rows.
If `split` is True the sentence is treated as a whitespace-delimited string of tokens;
otherwise it is treated as an iterable of tokens.
'''
if split:
chars_ids = [self.word_to_char_ids(cur_word)
for cur_word in sentence.split()]
else:
chars_ids = [self.word_to_char_ids(cur_word)
for cur_word in sentence]
if reverse:
return np.vstack([self.eos_chars] + chars_ids + [self.bos_chars])
else:
return np.vstack([self.bos_chars] + chars_ids + [self.eos_chars])

def __getitem__(self, tokens):
"""Looks up indices of text tokens according to the vocabulary.
If `unknown_token` of the vocabulary is None, looking up unknown tokens results in KeyError.
Parameters
----------
tokens : str or list of strs
A source token or tokens to be converted.
Returns
-------
int, list of ints, or numpy.ndarray
A token index, a list of token indices, or an array of token indices according to the
vocabulary.
"""

if isinstance(tokens, (list, tuple)):
return [self._token_to_idx[token] for token in tokens]
elif isinstance(tokens, np.ndarray):
vfunc = np.vectorize(self._token_to_idx.__getitem__)
return vfunc(tokens)
else:
return self._token_to_idx[tokens]
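As a rough illustration of the character-id scheme described in the class docstring (utf-8 byte values 0-255 plus special ids 256-260 for begin/end of sentence, begin/end of word, and padding); the toy counter and max_word_length below are arbitrary:

import gluonnlp as nlp
from gluonnlp.vocab import UnicodeCharsVocabulary

counter = nlp.data.count_tokens('the quick brown fox'.split())
vocab = UnicodeCharsVocabulary(counter, max_word_length=10)

# Per-word character ids: <bow>, utf-8 bytes, <eow>, then padding.
ids = vocab.word_to_char_ids('fox')
# ids[0] == 258 (bow_char), ids[1:4] == [ord('f'), ord('o'), ord('x')],
# ids[4] == 259 (eow_char), remaining entries == 260 (pad_char)

# encode_chars stacks <bos>/<eos> rows around the per-word rows.
sentence_ids = vocab.encode_chars('the quick brown fox')  # shape (6, 10)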
96 changes: 96 additions & 0 deletions scripts/language_model/ELMo-biLM/LSTMPCellWithClip.py
@@ -0,0 +1,96 @@
from mxnet.gluon.contrib.rnn import LSTMPCell

class LSTMPCellWithClip(LSTMPCell):
r"""Long-Short Term Memory Projected (LSTMP) network cell.
(https://arxiv.org/abs/1402.1128)
Each call computes the following function:
.. math::
\begin{array}{ll}
i_t = sigmoid(W_{ii} x_t + b_{ii} + W_{ri} r_{(t-1)} + b_{ri}) \\
f_t = sigmoid(W_{if} x_t + b_{if} + W_{rf} r_{(t-1)} + b_{rf}) \\
g_t = \tanh(W_{ig} x_t + b_{ig} + W_{rg} r_{(t-1)} + b_{rg}) \\
o_t = sigmoid(W_{io} x_t + b_{io} + W_{ro} r_{(t-1)} + b_{ro}) \\
c_t = f_t * c_{(t-1)} + i_t * g_t \\
h_t = o_t * \tanh(c_t) \\
r_t = W_{hr} h_t
\end{array}
where :math:`r_t` is the projected recurrent activation at time `t`,
:math:`h_t` is the hidden state at time `t`, :math:`c_t` is the
cell state at time `t`, :math:`x_t` is the input at time `t`, and :math:`i_t`,
:math:`f_t`, :math:`g_t`, :math:`o_t` are the input, forget, cell, and
out gates, respectively.
Parameters
----------
hidden_size : int
Number of units in cell state symbol.
projection_size : int
Number of units in output symbol.
i2h_weight_initializer : str or Initializer
Initializer for the input weights matrix, used for the linear
transformation of the inputs.
h2h_weight_initializer : str or Initializer
Initializer for the recurrent weights matrix, used for the linear
transformation of the hidden state.
h2r_weight_initializer : str or Initializer
Initializer for the projection weights matrix, used for the linear
transformation of the recurrent state.
i2h_bias_initializer : str or Initializer, default 'lstmbias'
Initializer for the bias vector. By default, bias for the forget
gate is initialized to 1 while all other biases are initialized
to zero.
h2h_bias_initializer : str or Initializer
Initializer for the bias vector.
cell_clip : float or None, default None
If not None, the cell state is clipped element-wise to [-cell_clip, cell_clip] at each step.
projection_clip : float or None, default None
If not None, the projected recurrent activation is clipped element-wise to
[-projection_clip, projection_clip] at each step.
prefix : str, default 'lstmp_'
Prefix for name of `Block`s
(and name of weight if params is `None`).
params : Parameter or None
Container for weight sharing between cells.
Created if `None`.
Inputs:
- **data**: input tensor with shape `(batch_size, input_size)`.
- **states**: a list of two initial recurrent state tensors, with shape
`(batch_size, projection_size)` and `(batch_size, hidden_size)` respectively.
Outputs:
- **out**: output tensor with shape `(batch_size, projection_size)`.
- **next_states**: a list of two output recurrent state tensors. Each has
the same shape as `states`.
"""
def __init__(self, hidden_size, projection_size,
i2h_weight_initializer=None, h2h_weight_initializer=None,
h2r_weight_initializer=None,
i2h_bias_initializer='zeros', h2h_bias_initializer='zeros',
input_size=0, cell_clip=None, projection_clip=None, prefix=None, params=None):
super(LSTMPCellWithClip, self).__init__(hidden_size, projection_size, i2h_weight_initializer,
h2h_weight_initializer, h2r_weight_initializer, i2h_bias_initializer,
h2h_bias_initializer, input_size, prefix=prefix, params=params)

self._cell_clip = cell_clip
self._projection_clip = projection_clip

# pylint: disable= arguments-differ
def hybrid_forward(self, F, inputs, states, i2h_weight,
h2h_weight, h2r_weight, i2h_bias, h2h_bias):
prefix = 't%d_'%self._counter
i2h = F.FullyConnected(data=inputs, weight=i2h_weight, bias=i2h_bias,
num_hidden=self._hidden_size*4, name=prefix+'i2h')
h2h = F.FullyConnected(data=states[0], weight=h2h_weight, bias=h2h_bias,
num_hidden=self._hidden_size*4, name=prefix+'h2h')
gates = i2h + h2h
slice_gates = F.SliceChannel(gates, num_outputs=4, name=prefix+'slice')
in_gate = F.Activation(slice_gates[0], act_type="sigmoid", name=prefix+'i')
forget_gate = F.Activation(slice_gates[1], act_type="sigmoid", name=prefix+'f')
in_transform = F.Activation(slice_gates[2], act_type="tanh", name=prefix+'c')
out_gate = F.Activation(slice_gates[3], act_type="sigmoid", name=prefix+'o')
next_c = F._internal._plus(forget_gate * states[1], in_gate * in_transform,
name=prefix+'state')
if self._cell_clip is not None:
F.clip(next_c, a_min=-self._cell_clip, a_max=self._cell_clip, out=next_c)
hidden = F._internal._mul(out_gate, F.Activation(next_c, act_type="tanh"),
name=prefix+'hidden')
next_r = F.FullyConnected(data=hidden, num_hidden=self._projection_size,
weight=h2r_weight, no_bias=True, name=prefix+'out')
if self._projection_clip is not None:
F.clip(next_r, a_min=-self._projection_clip, a_max=self._projection_clip, out=next_r)

return next_r, [next_r, next_c]
# pylint: enable= arguments-differ
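
A small usage sketch for a single step of the cell (assuming it is run from this script's directory; the sizes and clip values below are arbitrary):

import mxnet as mx
from LSTMPCellWithClip import LSTMPCellWithClip

cell = LSTMPCellWithClip(hidden_size=32, projection_size=16, cell_clip=3.0, projection_clip=3.0)
cell.initialize()

# One time step: input of shape (batch_size, input_size) plus two states of shape
# (batch_size, projection_size) and (batch_size, hidden_size).
x = mx.nd.random.uniform(shape=(4, 8))
states = cell.begin_state(batch_size=4)
output, next_states = cell(x, states)
print(output.shape)  # (4, 16): the clipped, projected recurrent activation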
