This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

Fix nmt #74

Closed
wants to merge 172 commits
Changes from all commits
172 commits
8478f49
Migrate gluon-nlp-toolkit
Apr 4, 2018
3b7fc0c
add translation dataset
sxjscience Apr 12, 2018
001acc6
fix
sxjscience Apr 12, 2018
01fc2b0
fix
sxjscience Apr 12, 2018
7b967e7
fix
sxjscience Apr 12, 2018
648097e
fix
sxjscience Apr 12, 2018
0405794
fix
sxjscience Apr 12, 2018
612e338
change default behavior of skip_empty to False
sxjscience Apr 12, 2018
4dbb930
try to accelerate
sxjscience Apr 12, 2018
c7bb469
try to accelerate
sxjscience Apr 12, 2018
0b1371b
fix
sxjscience Apr 12, 2018
1464e51
try to accelerate
sxjscience Apr 12, 2018
6042052
fix
sxjscience Apr 12, 2018
9c0e6dd
fix
sxjscience Apr 12, 2018
c50e16b
fix
sxjscience Apr 12, 2018
cc1769a
add TextLineDataset
sxjscience Apr 12, 2018
f97595e
fix docstring
sxjscience Apr 12, 2018
0cae0e8
try to update FixedBucketSampler
sxjscience Apr 13, 2018
a1e803f
fix
sxjscience Apr 13, 2018
99110c3
fix
sxjscience Apr 13, 2018
0bbf0c9
Merge remote-tracking branch 'upstream/master'
sxjscience Apr 18, 2018
cf8519f
add beam search
sxjscience Apr 18, 2018
3c03dfa
hybridize
sxjscience Apr 18, 2018
8038b21
fix testing
sxjscience Apr 18, 2018
4f1d48a
return int32
sxjscience Apr 18, 2018
e93236a
fix
sxjscience Apr 18, 2018
7b200ba
fix lint
sxjscience Apr 18, 2018
f6720fd
add to __init__
sxjscience Apr 18, 2018
3c21d65
fix
sxjscience Apr 18, 2018
f726dfe
fix
sxjscience Apr 18, 2018
b374167
fix
sxjscience Apr 18, 2018
032a7e9
Merge remote-tracking branch 'upstream/master'
sxjscience Apr 19, 2018
637912d
Merge remote-tracking branch 'upstream/master'
sxjscience Apr 19, 2018
8d3fb57
try to simplify
sxjscience Apr 19, 2018
22e7ab7
fix
sxjscience Apr 19, 2018
93d7038
add translation
sxjscience Apr 19, 2018
798588d
update
sxjscience Apr 19, 2018
2e55fd7
fix
sxjscience Apr 19, 2018
243a5b5
update init
sxjscience Apr 19, 2018
41ba3de
fix
sxjscience Apr 19, 2018
ad20914
Merge remote-tracking branch 'upstream/master'
sxjscience Apr 19, 2018
ba5f7c0
Add attention
sxjscience Apr 19, 2018
f4d842e
Merge remote-tracking branch 'upstream/master'
sxjscience Apr 20, 2018
d4685c5
add nmt models
sxjscience Apr 20, 2018
d17b4ef
fix bleu
sxjscience Apr 20, 2018
1365c43
try to add tokenizeddataset
sxjscience Apr 20, 2018
1d4b7cc
fix
sxjscience Apr 20, 2018
5e78daa
fix
sxjscience Apr 20, 2018
4503847
update
sxjscience Apr 20, 2018
e21970a
update
sxjscience Apr 20, 2018
6289c20
fix
sxjscience Apr 20, 2018
705e0a6
fix
sxjscience Apr 20, 2018
58be334
fix
sxjscience Apr 20, 2018
bcd70b9
fix
sxjscience Apr 20, 2018
a554377
fix
sxjscience Apr 20, 2018
1998439
fix
sxjscience Apr 20, 2018
4aeed20
fix
sxjscience Apr 20, 2018
95ae1e9
fix
sxjscience Apr 20, 2018
8ee3eee
update
sxjscience Apr 20, 2018
fac5bea
fix
sxjscience Apr 20, 2018
4acfdd2
fix
sxjscience Apr 20, 2018
f279193
fix
sxjscience Apr 20, 2018
689371d
fix
sxjscience Apr 20, 2018
3d57383
update
sxjscience Apr 20, 2018
a81aa42
fix
sxjscience Apr 20, 2018
c091a4d
try to fix
sxjscience Apr 20, 2018
c81c0eb
update
sxjscience Apr 21, 2018
df5b436
fix
sxjscience Apr 21, 2018
c0f1a9e
fix
sxjscience Apr 21, 2018
b211218
fix
sxjscience Apr 21, 2018
d319bb7
update
sxjscience Apr 21, 2018
1c53139
fix
sxjscience Apr 21, 2018
0426de5
update
sxjscience Apr 21, 2018
7411f14
fix
sxjscience Apr 21, 2018
ca67ca0
fix
sxjscience Apr 21, 2018
0e15b75
fix
sxjscience Apr 21, 2018
aef9c9d
fix
sxjscience Apr 21, 2018
7967cd9
fix
sxjscience Apr 21, 2018
75c3c11
fix
sxjscience Apr 21, 2018
e7064c8
fix
sxjscience Apr 21, 2018
59090b7
fix
sxjscience Apr 21, 2018
87c4e49
update
sxjscience Apr 21, 2018
507ea61
fix
sxjscience Apr 21, 2018
4ccc835
fix
sxjscience Apr 21, 2018
e4b8a15
update
sxjscience Apr 21, 2018
77ab79e
fix
sxjscience Apr 21, 2018
8e37b63
fix
sxjscience Apr 21, 2018
710c8e8
fix
sxjscience Apr 21, 2018
3345024
fix
sxjscience Apr 21, 2018
7c09b2b
update
sxjscience Apr 21, 2018
3402806
update
sxjscience Apr 21, 2018
289bfe5
fix
sxjscience Apr 21, 2018
b4359c3
fix log
sxjscience Apr 21, 2018
a6f712f
fix
sxjscience Apr 21, 2018
39898bc
update
sxjscience Apr 21, 2018
e306ce7
update
sxjscience Apr 21, 2018
d9d93d3
fix
sxjscience Apr 21, 2018
adb0903
update
sxjscience Apr 21, 2018
b6ba942
fix
sxjscience Apr 21, 2018
e4c3664
update
sxjscience Apr 21, 2018
1c35f5c
fix
sxjscience Apr 21, 2018
bad4f2a
fix
sxjscience Apr 21, 2018
c1116f6
fix
sxjscience Apr 21, 2018
13ba8d7
fix
sxjscience Apr 21, 2018
7ed80fe
revert
sxjscience Apr 21, 2018
d520816
update
sxjscience Apr 21, 2018
6a26c37
fix
sxjscience Apr 21, 2018
9e339c8
fix
sxjscience Apr 21, 2018
beb5c3a
fix
sxjscience Apr 21, 2018
b5f979b
fix
sxjscience Apr 21, 2018
458f1c2
try to fix
sxjscience Apr 21, 2018
34a5319
update
sxjscience Apr 21, 2018
a4429fb
fix
sxjscience Apr 22, 2018
3208035
fix
sxjscience Apr 22, 2018
1097ca4
fix
sxjscience Apr 22, 2018
ae0c4b0
fix
sxjscience Apr 22, 2018
d98eb84
update
sxjscience Apr 22, 2018
77ecea5
fix
sxjscience Apr 22, 2018
abf78c9
fix
sxjscience Apr 22, 2018
fc30816
try to fix
sxjscience Apr 22, 2018
f3965bb
fix
sxjscience Apr 22, 2018
719fbc0
fix
sxjscience Apr 22, 2018
2100ebe
fix
sxjscience Apr 22, 2018
802a44f
fix
sxjscience Apr 22, 2018
1bd75d8
test
sxjscience Apr 22, 2018
1e2a47e
try to fix bleu
sxjscience Apr 22, 2018
bb4db30
fix
sxjscience Apr 22, 2018
6f0d995
fix
sxjscience Apr 22, 2018
0cd9f57
fi
sxjscience Apr 22, 2018
a7ed7b7
fix
sxjscience Apr 22, 2018
49913f9
fix
sxjscience Apr 22, 2018
0b28fa1
update
sxjscience Apr 22, 2018
b490ec0
fix
sxjscience Apr 22, 2018
a62b360
fix
sxjscience Apr 22, 2018
c35c8c4
fix
sxjscience Apr 22, 2018
a7a2b12
fix
sxjscience Apr 22, 2018
63fc4de
fix
sxjscience Apr 22, 2018
c54608f
fix
sxjscience Apr 22, 2018
a6584c0
fix
sxjscience Apr 22, 2018
59dbf64
fix
sxjscience Apr 22, 2018
c4775a2
fix
sxjscience Apr 22, 2018
73a6379
fix
sxjscience Apr 22, 2018
cc06927
fix
sxjscience Apr 22, 2018
455af38
add example
sxjscience Apr 22, 2018
3027687
fix
sxjscience Apr 22, 2018
d015968
fix
sxjscience Apr 22, 2018
25f8e00
fix
sxjscience Apr 22, 2018
270f8c1
fix
sxjscience Apr 22, 2018
a096dab
fix
sxjscience Apr 22, 2018
3d713c7
fix
sxjscience Apr 22, 2018
81ea8f9
fix example
sxjscience Apr 22, 2018
0cb0a48
update
sxjscience Apr 22, 2018
f309fe1
Merge remote-tracking branch 'upstream/master'
sxjscience Apr 22, 2018
bb6bae6
add rst
sxjscience Apr 22, 2018
624362a
fix
sxjscience Apr 22, 2018
1466f54
remove nose
sxjscience Apr 22, 2018
5dd5282
add doc + fix lint
sxjscience Apr 22, 2018
928c8b5
try to fix lint
sxjscience Apr 22, 2018
c41da66
fix lint
sxjscience Apr 22, 2018
1f6fa8c
fix lint
sxjscience Apr 22, 2018
d2b86a2
fix
sxjscience Apr 22, 2018
30be9f1
fix lint
sxjscience Apr 22, 2018
a53fe25
fix
sxjscience Apr 22, 2018
8ee46c0
fix
sxjscience Apr 22, 2018
a518a3c
fix lint
sxjscience Apr 22, 2018
5f06916
fix lint
sxjscience Apr 22, 2018
ccdf925
fix
sxjscience Apr 22, 2018
a8098b1
add docstring
sxjscience Apr 22, 2018
3990b61
fix
sxjscience Apr 22, 2018
9931876
Merge remote-tracking branch 'upstream/master'
sxjscience Apr 23, 2018
9d4a3aa
fix
sxjscience Apr 23, 2018
0893872
fix bug
sxjscience Apr 23, 2018
1 change: 0 additions & 1 deletion docs/scripts

This file was deleted.

12 changes: 12 additions & 0 deletions docs/scripts/index.rst
@@ -0,0 +1,12 @@
Scripts
=======
Here are some useful training scripts.

.. include:: word_language_model.rst

See :download:`this example script <word_language_model.py>`

.. include:: sentiment_analysis.rst

See :download:`this example script <sentiment_analysis.py>`

341 changes: 341 additions & 0 deletions docs/scripts/sentiment_analysis.py
@@ -0,0 +1,341 @@
"""
Fine-tune Language Model for Sentiment Analysis
===============================================

This example shows how to load a language model pre-trained on wikitext-2 from the Gluon NLP Toolkit
model zoo, and reuse the language model encoder for sentiment analysis on the IMDB movie review dataset.
"""

# coding: utf-8

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import argparse
import time
import random
import numpy as np
import mxnet as mx
from mxnet import gluon, autograd
from mxnet.gluon import Block, HybridBlock
from mxnet.gluon.data import SimpleDataset, ArrayDataset, DataLoader
import gluonnlp
from gluonnlp.data.sentiment import IMDB
from gluonnlp.data import batchify as bf
from gluonnlp.data.transforms import SpacyTokenizer, ClipSequence
from gluonnlp.data.sampler import FixedBucketSampler, SortedBucketSampler, SortedSampler
from gluonnlp.data.utils import train_valid_split
import multiprocessing as mp

np.random.seed(100)
random.seed(100)
mx.random.seed(10000)

tokenizer = SpacyTokenizer('en')
length_clip = ClipSequence(500)


def parse_args():
    parser = argparse.ArgumentParser(description='MXNet Sentiment Analysis Example on IMDB. '
                                                 'We load a LSTM model that is pretrained on WikiText '
                                                 'as our encoder.')
    parser.add_argument('--lm_model', type=str, default='standard_lstm_lm_200',
                        help='type of the pretrained model to load, can be "standard_lstm_lm_200", '
                             '"standard_lstm_lm_650", etc.')
    parser.add_argument('--use-mean-pool', type=bool, default=True,
                        help='whether to use mean pooling to aggregate the states from '
                             'different timestamps.')
    parser.add_argument('--no_pretrained', action='store_true',
                        help='Turn on the option to just use the structure and not load the '
                             'pretrained weights.')
    parser.add_argument('--lr', type=float, default=2.5E-3,
                        help='initial learning rate')
    parser.add_argument('--clip', type=float, default=None, help='gradient clipping')
    parser.add_argument('--bucket_type', type=str, default=None,
                        help='Can be "fixed" or "sorted"')
    parser.add_argument('--bucket_num', type=int, default=10,
                        help='The bucket_num if bucket_type is "fixed".')
    parser.add_argument('--bucket_ratio', type=float, default=0.0,
                        help='The ratio used in the FixedBucketSampler.')
    parser.add_argument('--bucket_mult', type=int, default=100,
                        help='The mult used in the SortedBucketSampler.')
    parser.add_argument('--valid_ratio', type=float, default=0.05,
                        help='Proportion [0, 1] of training samples to use for validation set.')
    parser.add_argument('--epochs', type=int, default=20,
                        help='upper epoch limit')
    parser.add_argument('--batch_size', type=int, default=16, metavar='N',
                        help='batch size')
    parser.add_argument('--dropout', type=float, default=0.,
                        help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--log-interval', type=int, default=30, metavar='N',
                        help='report interval')
    parser.add_argument('--save', type=str, default='model.params',
                        help='path to save the final model')
    parser.add_argument('--gpu', type=int, default=None,
                        help='id of the GPU to use. Leave it unset to run on the CPU.')
    args = parser.parse_args()
    return args


def preprocess(x):
    data, label = x
    label = int(label > 5)
    data = vocab[length_clip(tokenizer(data))]
    return data, label


def get_length(x):
    return float(len(x[0]))


def load_data():
    # Load the dataset
    train_dataset, test_dataset = [IMDB(root='data/imdb', segment=segment)
                                   for segment in ('train', 'test')]
    train_dataset, valid_dataset = train_valid_split(train_dataset, args.valid_ratio)
    print("Tokenize using spaCy...")

    def preprocess_dataset(dataset):
        start = time.time()
        with mp.Pool(8) as pool:
            dataset = gluon.data.SimpleDataset(pool.map(preprocess, dataset))
            lengths = gluon.data.SimpleDataset(pool.map(get_length, dataset))
        end = time.time()
        print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
        return dataset, lengths

    # Preprocess the dataset
    train_dataset, train_data_lengths = preprocess_dataset(train_dataset)
    valid_dataset, valid_data_lengths = preprocess_dataset(valid_dataset)
    test_dataset, test_data_lengths = preprocess_dataset(test_dataset)
    return train_dataset, train_data_lengths, valid_dataset, valid_data_lengths, \
        test_dataset, test_data_lengths


class AggregationLayer(HybridBlock):
    """Aggregate the encoder states along the time axis, either by mean pooling over the
    valid time steps or by taking the last valid state."""
    def __init__(self, use_mean_pool=False, prefix=None, params=None):
        super(AggregationLayer, self).__init__(prefix=prefix, params=params)
        self._use_mean_pool = use_mean_pool

    def hybrid_forward(self, F, data, valid_length):
        # Data will have shape (T, N, C)
        if self._use_mean_pool:
            masked_encoded = F.SequenceMask(data,
                                            sequence_length=valid_length,
                                            use_sequence_length=True)
            agg_state = F.broadcast_div(F.sum(masked_encoded, axis=0),
                                        F.expand_dims(valid_length, axis=1))
        else:
            agg_state = F.SequenceLast(data,
                                       sequence_length=valid_length,
                                       use_sequence_length=True)
        return agg_state


class SentimentNet(Block):
    def __init__(self, lm_model, dropout, use_mean_pool=False, prefix=None, params=None):
        super(SentimentNet, self).__init__(prefix=prefix, params=params)
        self._use_mean_pool = use_mean_pool
        with self.name_scope():
            self.embedding = lm_model.embedding
            self.encoder = lm_model.encoder
            self.agg_layer = AggregationLayer(use_mean_pool=use_mean_pool)
            self.out_layer = gluon.nn.HybridSequential()
            with self.out_layer.name_scope():
                self.out_layer.add(gluon.nn.Dropout(dropout))
                self.out_layer.add(gluon.nn.Dense(1, flatten=False))

    def forward(self, data, valid_length):
        encoded = self.encoder(self.embedding(data))  # Shape(T, N, C)
        agg_state = self.agg_layer(encoded, valid_length)
        out = self.out_layer(agg_state)
        return out


def evaluate(net, dataloader, context):
    loss = gluon.loss.SigmoidBCELoss()
    total_L = 0.0
    total_sample_num = 0
    total_correct_num = 0
    start_log_interval_time = time.time()
    print('Begin Testing...')
    for i, ((data, valid_length), label) in enumerate(dataloader):
        data = mx.nd.transpose(data.as_in_context(context))
        valid_length = valid_length.as_in_context(context).astype(np.float32)
        label = label.as_in_context(context)
        output = net(data, valid_length)
        L = loss(output, label)
        pred = (output > 0.5).reshape((-1,))
        total_L += L.sum().asscalar()
        total_sample_num += label.shape[0]
        total_correct_num += (pred == label).sum().asscalar()
        if (i + 1) % args.log_interval == 0:
            print('[Batch {}/{}] elapsed {:.2f} s'.format(
                i + 1, len(dataloader), time.time() - start_log_interval_time))
            start_log_interval_time = time.time()
    avg_L = total_L / float(total_sample_num)
    acc = total_correct_num / float(total_sample_num)
    return avg_L, acc


args = parse_args()
print(args)
pretrained = not args.no_pretrained
# Load the pretrained model
if args.gpu is None:
    print("Use cpu")
    context = mx.cpu()
else:
    print("Use gpu%d" % args.gpu)
    context = mx.gpu(args.gpu)
lm_model, vocab = gluonnlp.model.get_model(name=args.lm_model,
                                           dataset_name='wikitext-2',
                                           pretrained=pretrained,
                                           ctx=context,
                                           dropout=args.dropout,
                                           prefix='sent_net_')
# Load and preprocess the dataset
train_dataset, train_data_lengths, \
    valid_dataset, valid_data_lengths, \
    test_dataset, test_data_lengths = load_data()


def train():
    start_pipeline_time = time.time()
    net = SentimentNet(lm_model=lm_model, dropout=args.dropout, use_mean_pool=args.use_mean_pool,
                       prefix='sent_net_')
    net.hybridize()
    print(net)
    if args.no_pretrained:
        net.collect_params().initialize(mx.init.Xavier(), ctx=context)
    else:
        net.out_layer.initialize(mx.init.Xavier(), ctx=context)
    trainer = gluon.Trainer(net.collect_params(), 'ftml', {'learning_rate': args.lr})
    loss = gluon.loss.SigmoidBCELoss()

    # Construct the DataLoader
    batchify_fn = bf.Tuple(bf.Pad(axis=0, ret_length=True), bf.Stack())  # Pad data and stack label
    if args.bucket_type is None:
        print("Bucketing strategy is not used!")
        train_dataloader = DataLoader(dataset=train_dataset,
                                      batch_size=args.batch_size,
                                      shuffle=True,
                                      batchify_fn=batchify_fn)
    else:
        if args.bucket_type == "fixed":
            print("Use FixedBucketSampler")
            batch_sampler = FixedBucketSampler(train_data_lengths,
                                               batch_size=args.batch_size,
                                               num_buckets=args.bucket_num,
                                               ratio=args.bucket_ratio,
                                               shuffle=True)
            print(batch_sampler.stats())
        elif args.bucket_type == "sorted":
            print("Use SortedBucketSampler")
            batch_sampler = SortedBucketSampler(train_data_lengths,
                                                batch_size=args.batch_size,
                                                mult=args.bucket_mult,
                                                shuffle=True)
        else:
            raise NotImplementedError
        train_dataloader = DataLoader(dataset=train_dataset,
                                      batch_sampler=batch_sampler,
                                      batchify_fn=batchify_fn)

    valid_dataloader = DataLoader(dataset=valid_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  sampler=SortedSampler(valid_data_lengths),
                                  batchify_fn=batchify_fn)

    test_dataloader = DataLoader(dataset=test_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 sampler=SortedSampler(test_data_lengths),
                                 batchify_fn=batchify_fn)

    # Training/Testing
    best_valid_acc = 0
    stop_early = 0
    for epoch in range(args.epochs):
        # Epoch training stats
        start_epoch_time = time.time()
        epoch_L = 0.0
        epoch_sent_num = 0
        epoch_wc = 0
        # Log interval training stats
        start_log_interval_time = time.time()
        log_interval_wc = 0
        log_interval_sent_num = 0
        log_interval_L = 0.0

        for i, ((data, valid_length), label) in enumerate(train_dataloader):
            data = mx.nd.transpose(data.as_in_context(context))
            label = label.as_in_context(context)
            valid_length = valid_length.as_in_context(context).astype(np.float32)
            wc = valid_length.sum().asscalar()
            log_interval_wc += wc
            epoch_wc += wc
            log_interval_sent_num += data.shape[1]
            epoch_sent_num += data.shape[1]
            with autograd.record():
                output = net(data, valid_length)
                L = loss(output, label).mean()
            L.backward()
            # Clip gradient
            if args.clip is not None:
                grads = [p.grad(context) for p in net.collect_params().values()]
                gluon.utils.clip_global_norm(grads, args.clip)
            # Update parameter
            trainer.step(1)
            log_interval_L += L.asscalar()
            epoch_L += L.asscalar()
            if (i + 1) % args.log_interval == 0:
                print('[Epoch %d Batch %d/%d] avg loss %g, throughput %gK wps' % (
                    epoch, i + 1, len(train_dataloader),
                    log_interval_L / log_interval_sent_num,
                    log_interval_wc / 1000 / (time.time() - start_log_interval_time)))
                # Clear log interval training stats
                start_log_interval_time = time.time()
                log_interval_wc = 0
                log_interval_sent_num = 0
                log_interval_L = 0
        end_epoch_time = time.time()
        valid_avg_L, valid_acc = evaluate(net, valid_dataloader, context)
        test_avg_L, test_acc = evaluate(net, test_dataloader, context)
        print('[Epoch %d] train avg loss %g, valid acc %.4f, valid avg loss %g, test acc %.4f, test avg loss %g, throughput %gK wps' % (
            epoch, epoch_L / epoch_sent_num,
            valid_acc, valid_avg_L, test_acc, test_avg_L,
            epoch_wc / 1000 / (end_epoch_time - start_epoch_time)))

        if valid_acc < best_valid_acc:
            print("No Improvement.")
            stop_early += 1
            if stop_early == 3:
                break
        else:
            # Reset stop_early if validation accuracy improves
            print("Observe Improvement")
            stop_early = 0
            net.save_params(args.save)
            best_valid_acc = valid_acc

    net.load_params(args.save, context)
    valid_avg_L, valid_acc = evaluate(net, valid_dataloader, context)
    test_avg_L, test_acc = evaluate(net, test_dataloader, context)
    print('Best validation loss %g, validation acc %.4f' % (valid_avg_L, valid_acc))
    print('Best test loss %g, test acc %.4f' % (test_avg_L, test_acc))
    print('Total time cost %.2fs' % (time.time() - start_pipeline_time))


if __name__ == "__main__":
    train()

21 changes: 21 additions & 0 deletions docs/scripts/sentiment_analysis.rst
@@ -0,0 +1,21 @@
Sentiment Analysis through Fine-tuning, w/ Bucketing
----------------------------------------------------

This script can be used to train a sentiment analysis model from scratch, or fine-tune a pre-trained language model.
The pre-trained language models are loaded from the Gluon NLP Toolkit model zoo. It also showcases how to use different
bucketing strategies to speed up training (see the sketch below).
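
A minimal illustrative sketch of the bucketing idea (not part of the training script; the toy dataset below is
hypothetical, and the imports follow the ones used in ``sentiment_analysis.py``): ``FixedBucketSampler`` groups
sequences of similar length into the same mini-batch, so each batch needs less padding.

.. code-block:: python

   from mxnet.gluon.data import DataLoader
   from gluonnlp.data import batchify as bf
   from gluonnlp.data.sampler import FixedBucketSampler

   # Hypothetical toy data: token-id lists of varying length with binary labels.
   dataset = [([1, 2, 3], 0), ([4, 5], 1), ([6, 7, 8, 9, 10], 1), ([11], 0)]
   lengths = [len(tokens) for tokens, _ in dataset]

   # Put samples of similar length into the same bucket/batch to reduce padding.
   sampler = FixedBucketSampler(lengths, batch_size=2, num_buckets=2, shuffle=True)
   print(sampler.stats())  # summary of the generated buckets and batch sizes

   # Pad the token ids (also returning valid lengths) and stack the labels.
   batchify_fn = bf.Tuple(bf.Pad(axis=0, ret_length=True), bf.Stack())
   loader = DataLoader(dataset, batch_sampler=sampler, batchify_fn=batchify_fn)
   for (data, valid_length), label in loader:
       print(data.shape, valid_length.asnumpy(), label.asnumpy())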

Use the following command to run without using a pretrained model:

.. code-block:: bash

   $ python sentiment_analysis.py --gpu 0 --batch_size 16 --bucket_type fixed --epochs 20 --dropout 0 --no_pretrained --lr 0.005 --valid_ratio 0.1 --save imdb_lstm_200.params # Test Accuracy 87.88

Use the following command to run with the pretrained model:

.. code-block:: bash

   $ python sentiment_analysis.py --gpu 0 --batch_size 16 --bucket_type fixed --epochs 20 --dropout 0 --lr 0.005 --valid_ratio 0.1 --save imdb_lstm_200.params # Test Accuracy 88.46


