From d34285cdf44c1830b351b4069d81d06c13091750 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Mon, 27 May 2019 14:38:14 +0000 Subject: [PATCH 01/18] Fix typos --- scripts/bert/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/bert/index.rst b/scripts/bert/index.rst index fa56754fd8..927b2ec9bd 100644 --- a/scripts/bert/index.rst +++ b/scripts/bert/index.rst @@ -346,7 +346,7 @@ Command line interface .. code-block:: shell - python bert_embedding/bert.py --sentences "GluonNLP is a toolkit that enables easy text preprocessing, datasets loading and neural models building to help you speed up your Natural Language Processing (NLP) research." + python bert/bert.py --sentences "GluonNLP is a toolkit that enables easy text preprocessing, datasets loading and neural models building to help you speed up your Natural Language Processing (NLP) research." Text: GluonNLP is a toolkit that enables easy text preprocessing, datasets loading and neural models building to help you speed up your Natural Language Processing (NLP) research. Tokens embedding: [array([-0.11881411, -0.59530115, 0.627092 , ..., 0.00648153, -0.03886228, 0.03406909], dtype=float32), array([-0.7995638 , -0.6540758 , -0.00521846, ..., -0.42272145, From 88466b26163e7eed01e543e4987eff0930f2ee20 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Wed, 29 May 2019 08:16:15 +0000 Subject: [PATCH 02/18] Refactor argparse --- scripts/bert/compare_tf_gluon_model.py | 25 +++++++++++++++---------- scripts/bert/convert_tf_model.py | 25 ++++++++++++++----------- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/scripts/bert/compare_tf_gluon_model.py b/scripts/bert/compare_tf_gluon_model.py index e35c216d52..f2017f170e 100644 --- a/scripts/bert/compare_tf_gluon_model.py +++ b/scripts/bert/compare_tf_gluon_model.py @@ -29,24 +29,29 @@ parser = argparse.ArgumentParser(description='Comparison script for BERT model in Tensorflow' 'and that in Gluon. This script works with ' - 'google/bert@f39e881b') + 'google/bert@f39e881b', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--input_file', type=str, default='input.txt', - help='sample input file for testing. Default is input.txt') + help='sample input file for testing') parser.add_argument('--tf_bert_repo_dir', type=str, default='~/bert/', help='path to the original Tensorflow bert repository. ' - 'The repo should be at f39e881b. ' - 'Default is ~/bert/') + 'The repo should be at f39e881b.') parser.add_argument('--tf_model_dir', type=str, default='~/uncased_L-12_H-768_A-12/', - help='path to the original Tensorflow bert checkpoint directory. ' - 'Default is ~/uncased_L-12_H-768_A-12/') + help='path to the original Tensorflow bert checkpoint directory.') +parser.add_argument('--tf_model_prefix', type=str, + default='bert_model.ckpt', + help='name of bert checkpoint file.') +parser.add_argument('--tf_config_name', type=str, + default='bert_config.json', + help='Name of Bert config file') parser.add_argument('--cased', action='store_true', help='if not set, inputs are converted to lower case') parser.add_argument('--gluon_dataset', type=str, default='book_corpus_wiki_en_uncased', - help='gluon dataset name. Default is book_corpus_wiki_en_uncased') + help='gluon dataset name') parser.add_argument('--gluon_model', type=str, default='bert_12_768_12', - help='gluon model name. 
Default is bert_12_768_12') + help='gluon model name') parser.add_argument('--gluon_parameter_file', type=str, default=None, help='gluon parameter file name.') @@ -56,8 +61,8 @@ tf_bert_repo_dir = os.path.expanduser(args.tf_bert_repo_dir) tf_model_dir = os.path.expanduser(args.tf_model_dir) vocab_file = os.path.join(tf_model_dir, 'vocab.txt') -bert_config_file = os.path.join(tf_model_dir, 'bert_config.json') -init_checkpoint = os.path.join(tf_model_dir, 'bert_model.ckpt') +bert_config_file = os.path.join(tf_model_dir, args.tf_config_name) +init_checkpoint = os.path.join(tf_model_dir, args.tf_model_prefix) do_lower_case = not args.cased max_length = 128 diff --git a/scripts/bert/convert_tf_model.py b/scripts/bert/convert_tf_model.py index 5cb0a5e7ae..bc276e7bb9 100644 --- a/scripts/bert/convert_tf_model.py +++ b/scripts/bert/convert_tf_model.py @@ -27,18 +27,21 @@ from gluonnlp.model.bert import bert_hparams from utils import convert_vocab, get_hash, read_tf_checkpoint -parser = argparse.ArgumentParser(description='Conversion script for Tensorflow BERT model') -parser.add_argument('--model', type=str, default='bert_12_768_12', - help='BERT model name. options are bert_12_768_12 and bert_24_1024_16.' - 'Default is bert_12_768_12') -parser.add_argument('--tf_checkpoint_dir', type=str, - default=os.path.join('~', 'cased_L-12_H-768_A-12/'), - help='Path to Tensorflow checkpoint folder. ' - 'Default is /home/ubuntu/cased_L-12_H-768_A-12/') -parser.add_argument('--out_dir', type=str, +parser = argparse.ArgumentParser( + description='Conversion script for Tensorflow BERT model', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--model', + type=str, + default='bert_12_768_12', + choices=['bert_12_768_12', 'bert_24_1024_16'], + help='BERT model name') +parser.add_argument('--tf_checkpoint_dir', + type=str, + help='Path to Tensorflow checkpoint folder.') +parser.add_argument('--out_dir', + type=str, default=os.path.join('~', 'output'), - help='Path to output folder. The folder must exist. ' - 'Default is /home/ubuntu/output/') + help='Path to output folder. The folder must exist.') parser.add_argument('--debug', action='store_true', help='debugging mode') args = parser.parse_args() logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO) From 623704891d51b3842c2e8d526109ece1e4eada2d Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Wed, 29 May 2019 08:47:35 +0000 Subject: [PATCH 03/18] Validate bert_config.json --- scripts/bert/convert_tf_model.py | 37 ++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/scripts/bert/convert_tf_model.py b/scripts/bert/convert_tf_model.py index bc276e7bb9..a2cffb1501 100644 --- a/scripts/bert/convert_tf_model.py +++ b/scripts/bert/convert_tf_model.py @@ -19,9 +19,10 @@ # pylint:disable=redefined-outer-name,logging-format-interpolation """ Script for converting TF Model to Gluon. 
""" -import os -import logging import argparse +import json +import logging +import os import mxnet as mx from gluonnlp.model import BERTEncoder, BERTModel from gluonnlp.model.bert import bert_hparams @@ -38,6 +39,9 @@ parser.add_argument('--tf_checkpoint_dir', type=str, help='Path to Tensorflow checkpoint folder.') +parser.add_argument('--tf_config_name', type=str, + default='bert_config.json', + help='Name of Bert config file') parser.add_argument('--out_dir', type=str, default=os.path.join('~', 'output'), @@ -124,11 +128,32 @@ embedding[source_idx][:] = dst embedding[dst_idx][:] = source logging.info('total number of tf parameters = %d', len(tf_tensors)) -logging.info('total number of mx parameters = %d (including decoder param for weight tying)', - len(mx_tensors)) - -# XXX assume no changes in BERT configs +logging.info( + 'total number of mx parameters = %d (including decoder param for weight tying)', + len(mx_tensors)) + +# BERT config +tf_config_names_to_gluon_config_names = { + 'attention_probs_dropout_prob': 'embed_dropout', + 'hidden_act': None, + 'hidden_dropout_prob': 'dropout', + 'hidden_size': 'units', + 'initializer_range': None, + 'intermediate_size': 'hidden_size', + 'max_position_embeddings': 'max_length', + 'num_attention_heads': 'num_heads', + 'num_hidden_layers': 'num_layers', + 'type_vocab_size': 'token_type_vocab_size', + 'vocab_size': None +} predefined_args = bert_hparams[args.model] +with open(os.path.join(args.tf_checkpoint_dir, args.tf_config_name), 'r') as f: + tf_config = json.load(f) + assert len(tf_config) == len(tf_config_names_to_gluon_config_names) + for tf_name, gluon_name in tf_config_names_to_gluon_config_names.items(): + if tf_name is None or gluon_name is None: + continue + assert tf_config[tf_name] == predefined_args[gluon_name] # BERT encoder encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'], From c81c2656fd44fb8fb267a744cad2cf8a1e7904f8 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Wed, 29 May 2019 09:07:06 +0000 Subject: [PATCH 04/18] Ignore saved optimizer parameters Typically Optimizer parameters are not exported for Bert models (eg. https://github.com/google-research/bert). As this does not always hold, we have to handle the case that the parameters are still present (eg. https://github.com/allenai/scibert). --- scripts/bert/convert_tf_model.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/scripts/bert/convert_tf_model.py b/scripts/bert/convert_tf_model.py index a2cffb1501..d6635cdef6 100644 --- a/scripts/bert/convert_tf_model.py +++ b/scripts/bert/convert_tf_model.py @@ -71,6 +71,15 @@ logging.info('loading Tensorflow checkpoint %s ...', tf_checkpoint_file) tf_tensors = read_tf_checkpoint(tf_checkpoint_file) tf_names = sorted(tf_tensors.keys()) + +tf_names = filter(lambda name: not name.endswith('adam_m'), tf_names) +tf_names = filter(lambda name: not name.endswith('adam_v'), tf_names) +tf_names = filter(lambda name: name != 'global_step', tf_names) +tf_names = list(tf_names) +if len(tf_tensors) != len(tf_names): + logging.info('Tensorflow model was saved with Optimizer parameters. 
' + 'Ignoring them.') + for name in tf_names: logging.debug('%s: %s', name, tf_tensors[name].shape) @@ -127,7 +136,7 @@ dst = embedding[dst_idx].copy() embedding[source_idx][:] = dst embedding[dst_idx][:] = source -logging.info('total number of tf parameters = %d', len(tf_tensors)) +logging.info('total number of tf parameters = %d', len(tf_names)) logging.info( 'total number of mx parameters = %d (including decoder param for weight tying)', len(mx_tensors)) From a41bd817c72156d7de6f18f5556e9dfaa066d709 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Wed, 29 May 2019 15:00:46 +0000 Subject: [PATCH 05/18] Make use of flexible vocab in scripts/bert/convert_tf_model.py --- scripts/bert/convert_tf_model.py | 11 +----- scripts/bert/utils.py | 63 +++++--------------------------- 2 files changed, 11 insertions(+), 63 deletions(-) diff --git a/scripts/bert/convert_tf_model.py b/scripts/bert/convert_tf_model.py index d6635cdef6..a4884ebbb0 100644 --- a/scripts/bert/convert_tf_model.py +++ b/scripts/bert/convert_tf_model.py @@ -26,7 +26,7 @@ import mxnet as mx from gluonnlp.model import BERTEncoder, BERTModel from gluonnlp.model.bert import bert_hparams -from utils import convert_vocab, get_hash, read_tf_checkpoint +from utils import load_tf_vocab, tf_vocab_to_gluon_vocab, get_hash, read_tf_checkpoint parser = argparse.ArgumentParser( description='Conversion script for Tensorflow BERT model', @@ -53,7 +53,7 @@ # convert vocabulary vocab_path = os.path.join(args.tf_checkpoint_dir, 'vocab.txt') -vocab, reserved_token_idx_map = convert_vocab(vocab_path) +vocab = tf_vocab_to_gluon_vocab(load_tf_vocab(vocab_path)) # vocab serialization tmp_file_path = os.path.expanduser(os.path.join(args.out_dir, 'tmp')) @@ -128,14 +128,7 @@ # post processings for parameters: # - handle tied decoder weight -# - update word embedding for reserved tokens mx_tensors['decoder.3.weight'] = mx_tensors['word_embed.0.weight'] -embedding = mx_tensors['word_embed.0.weight'] -for source_idx, dst_idx in reserved_token_idx_map: - source = embedding[source_idx].copy() - dst = embedding[dst_idx].copy() - embedding[source_idx][:] = dst - embedding[dst_idx][:] = source logging.info('total number of tf parameters = %d', len(tf_names)) logging.info( 'total number of mx parameters = %d (including decoder param for weight tying)', diff --git a/scripts/bert/utils.py b/scripts/bert/utils.py index 0549fffcb9..4219f81f10 100644 --- a/scripts/bert/utils.py +++ b/scripts/bert/utils.py @@ -25,62 +25,17 @@ import json import mxnet as mx -import gluonnlp +import gluonnlp as nlp -__all__ = ['convert_vocab'] +__all__ = ['tf_vocab_to_gluon_vocab', 'load_tf_vocab'] -def convert_vocab(vocab_file): - """GluonNLP specific code to convert the original vocabulary to nlp.vocab.BERTVocab.""" - original_vocab = load_vocab(vocab_file) - token_to_idx = dict(original_vocab) - num_tokens = len(token_to_idx) - idx_to_token = [None] * len(original_vocab) - for word in original_vocab: - idx = int(original_vocab[word]) - idx_to_token[idx] = word - - def swap(token, target_idx, token_to_idx, idx_to_token, swap_idx): - original_idx = token_to_idx[token] - original_token = idx_to_token[target_idx] - token_to_idx[token] = target_idx - token_to_idx[original_token] = original_idx - idx_to_token[target_idx] = token - idx_to_token[original_idx] = original_token - swap_idx.append((original_idx, target_idx)) - - reserved_tokens = [gluonnlp.vocab.bert.PADDING_TOKEN, gluonnlp.vocab.bert.CLS_TOKEN, - gluonnlp.vocab.bert.SEP_TOKEN, gluonnlp.vocab.bert.MASK_TOKEN] - - unknown_token 
= gluonnlp.vocab.bert.UNKNOWN_TOKEN - padding_token = gluonnlp.vocab.bert.PADDING_TOKEN - swap_idx = [] - assert unknown_token in token_to_idx - assert padding_token in token_to_idx - swap(unknown_token, 0, token_to_idx, idx_to_token, swap_idx) - for i, token in enumerate(reserved_tokens): - swap(token, i + 1, token_to_idx, idx_to_token, swap_idx) - - # sanity checks - assert len(token_to_idx) == num_tokens - assert len(idx_to_token) == num_tokens - assert None not in idx_to_token - assert len(set(idx_to_token)) == num_tokens - - bert_vocab_dict = {} - bert_vocab_dict['idx_to_token'] = idx_to_token - bert_vocab_dict['token_to_idx'] = token_to_idx - bert_vocab_dict['reserved_tokens'] = reserved_tokens - bert_vocab_dict['unknown_token'] = unknown_token - bert_vocab_dict['padding_token'] = padding_token - bert_vocab_dict['bos_token'] = None - bert_vocab_dict['eos_token'] = None - bert_vocab_dict['mask_token'] = gluonnlp.vocab.bert.MASK_TOKEN - bert_vocab_dict['sep_token'] = gluonnlp.vocab.bert.SEP_TOKEN - bert_vocab_dict['cls_token'] = gluonnlp.vocab.bert.CLS_TOKEN - json_str = json.dumps(bert_vocab_dict) - converted_vocab = gluonnlp.vocab.BERTVocab.from_json(json_str) - return converted_vocab, swap_idx +def tf_vocab_to_gluon_vocab(tf_vocab): + special_tokens = ['[UNK]', '[PAD]', '[SEP]', '[MASK]', '[CLS]'] + assert all(t in tf_vocab for t in special_tokens) + counter = nlp.data.count_tokens(tf_vocab.keys()) + vocab = nlp.vocab.BERTVocab(counter, token_to_idx=tf_vocab) + return vocab def get_hash(filename): @@ -122,7 +77,7 @@ def profile(curr_step, start_step, end_step, profile_name='profile.json', if early_exit: exit() -def load_vocab(vocab_file): +def load_tf_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = collections.OrderedDict() index = 0 From cfeac29754e7ef2417fd3733ee6ce0ec74c16d41 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Wed, 29 May 2019 10:18:22 +0000 Subject: [PATCH 06/18] Support non-standard datasets/vocabularies in compare_tf_gluon_model --- scripts/bert/compare_tf_gluon_model.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/bert/compare_tf_gluon_model.py b/scripts/bert/compare_tf_gluon_model.py index f2017f170e..ca6956af72 100644 --- a/scripts/bert/compare_tf_gluon_model.py +++ b/scripts/bert/compare_tf_gluon_model.py @@ -27,6 +27,8 @@ import mxnet as mx import gluonnlp as nlp +from utils import tf_vocab_to_gluon_vocab + parser = argparse.ArgumentParser(description='Comparison script for BERT model in Tensorflow' 'and that in Gluon. 
This script works with ' 'google/bert@f39e881b', @@ -134,8 +136,10 @@ # Gluon MODEL # ############################################################################### +vocabulary = tf_vocab_to_gluon_vocab(tokenizer.vocab) bert, vocabulary = nlp.model.get_model(args.gluon_model, - dataset_name=args.gluon_dataset, + dataset_name=None, + vocab=vocabulary, pretrained=not args.gluon_parameter_file, use_pooler=False, use_decoder=False, From e1ef91c1e9b05c53af5ae854884137acfd0bd8e0 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Mon, 3 Jun 2019 13:53:06 +0000 Subject: [PATCH 07/18] Address comments --- .../{ => conversion_tools}/compare_tf_gluon_model.py | 2 ++ .../bert/{ => conversion_tools}/convert_tf_model.py | 10 +++++++++- scripts/bert/index.rst | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) rename scripts/bert/{ => conversion_tools}/compare_tf_gluon_model.py (98%) rename scripts/bert/{ => conversion_tools}/convert_tf_model.py (97%) diff --git a/scripts/bert/compare_tf_gluon_model.py b/scripts/bert/conversion_tools/compare_tf_gluon_model.py similarity index 98% rename from scripts/bert/compare_tf_gluon_model.py rename to scripts/bert/conversion_tools/compare_tf_gluon_model.py index ca6956af72..015a509f18 100644 --- a/scripts/bert/compare_tf_gluon_model.py +++ b/scripts/bert/conversion_tools/compare_tf_gluon_model.py @@ -22,11 +22,13 @@ import sys import os +# import os.path import argparse import numpy as np import mxnet as mx import gluonnlp as nlp +sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))) from utils import tf_vocab_to_gluon_vocab parser = argparse.ArgumentParser(description='Comparison script for BERT model in Tensorflow' diff --git a/scripts/bert/convert_tf_model.py b/scripts/bert/conversion_tools/convert_tf_model.py similarity index 97% rename from scripts/bert/convert_tf_model.py rename to scripts/bert/conversion_tools/convert_tf_model.py index a4884ebbb0..bd0b540f9e 100644 --- a/scripts/bert/convert_tf_model.py +++ b/scripts/bert/conversion_tools/convert_tf_model.py @@ -23,10 +23,18 @@ import json import logging import os +import sys + import mxnet as mx + from gluonnlp.model import BERTEncoder, BERTModel from gluonnlp.model.bert import bert_hparams -from utils import load_tf_vocab, tf_vocab_to_gluon_vocab, get_hash, read_tf_checkpoint + +sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))) + +from utils import (get_hash, load_tf_vocab, read_tf_checkpoint, + tf_vocab_to_gluon_vocab) + parser = argparse.ArgumentParser( description='Conversion script for Tensorflow BERT model', diff --git a/scripts/bert/index.rst b/scripts/bert/index.rst index 927b2ec9bd..ffb3947bda 100644 --- a/scripts/bert/index.rst +++ b/scripts/bert/index.rst @@ -346,7 +346,7 @@ Command line interface .. code-block:: shell - python bert/bert.py --sentences "GluonNLP is a toolkit that enables easy text preprocessing, datasets loading and neural models building to help you speed up your Natural Language Processing (NLP) research." + python bert/embedding.py --sentences "GluonNLP is a toolkit that enables easy text preprocessing, datasets loading and neural models building to help you speed up your Natural Language Processing (NLP) research." Text: GluonNLP is a toolkit that enables easy text preprocessing, datasets loading and neural models building to help you speed up your Natural Language Processing (NLP) research. 
Tokens embedding: [array([-0.11881411, -0.59530115, 0.627092 , ..., 0.00648153, -0.03886228, 0.03406909], dtype=float32), array([-0.7995638 , -0.6540758 , -0.00521846, ..., -0.42272145, From c32f5387c56b8202c1a9f1d59419dd6ba31cfb20 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Wed, 5 Jun 2019 15:28:05 +0000 Subject: [PATCH 08/18] Fix lint --- scripts/bert/conversion_tools/compare_tf_gluon_model.py | 1 - scripts/bert/utils.py | 1 - 2 files changed, 2 deletions(-) diff --git a/scripts/bert/conversion_tools/compare_tf_gluon_model.py b/scripts/bert/conversion_tools/compare_tf_gluon_model.py index 015a509f18..b695aa81cd 100644 --- a/scripts/bert/conversion_tools/compare_tf_gluon_model.py +++ b/scripts/bert/conversion_tools/compare_tf_gluon_model.py @@ -22,7 +22,6 @@ import sys import os -# import os.path import argparse import numpy as np import mxnet as mx diff --git a/scripts/bert/utils.py b/scripts/bert/utils.py index 4219f81f10..62c8ea3730 100644 --- a/scripts/bert/utils.py +++ b/scripts/bert/utils.py @@ -22,7 +22,6 @@ import collections import hashlib import io -import json import mxnet as mx import gluonnlp as nlp From 19ff84ab51dd32d29031284559587b8bdec0f075 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Fri, 7 Jun 2019 07:33:16 +0000 Subject: [PATCH 09/18] Move input.txt --- scripts/bert/{ => conversion_tools}/input.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/bert/{ => conversion_tools}/input.txt (100%) diff --git a/scripts/bert/input.txt b/scripts/bert/conversion_tools/input.txt similarity index 100% rename from scripts/bert/input.txt rename to scripts/bert/conversion_tools/input.txt From 59b225060503519ba9acef072a7a38380322bfdd Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Fri, 7 Jun 2019 07:36:08 +0000 Subject: [PATCH 10/18] Fix vocabulary mismatch in compare_tf_gluon_model.py --- .../compare_tf_gluon_model.py | 31 +++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/scripts/bert/conversion_tools/compare_tf_gluon_model.py b/scripts/bert/conversion_tools/compare_tf_gluon_model.py index b695aa81cd..11c3609254 100644 --- a/scripts/bert/conversion_tools/compare_tf_gluon_model.py +++ b/scripts/bert/conversion_tools/compare_tf_gluon_model.py @@ -28,7 +28,6 @@ import gluonnlp as nlp sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))) -from utils import tf_vocab_to_gluon_vocab parser = argparse.ArgumentParser(description='Comparison script for BERT model in Tensorflow' 'and that in Gluon. 
This script works with ' @@ -57,6 +56,8 @@ help='gluon model name') parser.add_argument('--gluon_parameter_file', type=str, default=None, help='gluon parameter file name.') +parser.add_argument('--gluon_vocab_file', type=str, default=None, + help='gluon vocab file corresponding to --gluon_parameter_file.') args = parser.parse_args() @@ -137,15 +138,18 @@ # Gluon MODEL # ############################################################################### -vocabulary = tf_vocab_to_gluon_vocab(tokenizer.vocab) -bert, vocabulary = nlp.model.get_model(args.gluon_model, - dataset_name=None, - vocab=vocabulary, - pretrained=not args.gluon_parameter_file, - use_pooler=False, - use_decoder=False, - use_classifier=False) if args.gluon_parameter_file: + assert args.gluon_vocab_file, \ + 'Must specify --gluon_vocab_file when specifying --gluon_parameter_file' + with open(args.gluon_vocab_file, 'r') as f: + vocabulary = nlp.Vocab.from_json(f.read()) + bert, vocabulary = nlp.model.get_model(args.gluon_model, + dataset_name=None, + vocab=vocabulary, + pretrained=not args.gluon_parameter_file, + use_pooler=False, + use_decoder=False, + use_classifier=False) try: bert.cast('float16') bert.load_parameters(args.gluon_parameter_file, ignore_extra=True) @@ -153,6 +157,15 @@ except AssertionError: bert.cast('float32') bert.load_parameters(args.gluon_parameter_file, ignore_extra=True) +else: + assert not args.gluon_vocab_file, \ + 'Cannot specify --gluon_vocab_file without specifying --gluon_parameter_file' + bert, vocabulary = nlp.model.get_model(args.gluon_model, + dataset_name=args.gluon_dataset, + pretrained=not args.gluon_parameter_file, + use_pooler=False, + use_decoder=False, + use_classifier=False) print(bert) tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=do_lower_case) From 8cb5fc0b34f8835b084656f02a72e826c58c930b Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Fri, 7 Jun 2019 08:44:40 +0000 Subject: [PATCH 11/18] Add SciBert --- src/gluonnlp/data/utils.py | 6 ++++- src/gluonnlp/model/bert.py | 43 +++++++++++++++++++++++++++-------- tests/unittest/test_models.py | 18 +++++++++++---- 3 files changed, 51 insertions(+), 16 deletions(-) diff --git a/src/gluonnlp/data/utils.py b/src/gluonnlp/data/utils.py index a435e5adcc..a7649f9bb1 100644 --- a/src/gluonnlp/data/utils.py +++ b/src/gluonnlp/data/utils.py @@ -227,7 +227,11 @@ def _slice_pad_length(num_items, length, overlap=0): 'wiki_cn_cased': 'ddebd8f3867bca5a61023f73326fb125cf12b4f5', 'wiki_cn': 'ddebd8f3867bca5a61023f73326fb125cf12b4f5', 'wiki_multilingual_uncased': '2b2514cc539047b9179e9d98a4e68c36db05c97a', - 'wiki_multilingual': '2b2514cc539047b9179e9d98a4e68c36db05c97a'} + 'wiki_multilingual': '2b2514cc539047b9179e9d98a4e68c36db05c97a', + 'scibert_scivocab_uncased': '2d2566bfc416790ab2646ab0ada36ba628628d60', + 'scibert_scivocab_cased': '2c714475b521ab8542cb65e46259f6bfeed8041b', + 'scibert_basevocab_uncased': '80ef760a6bdafec68c99b691c94ebbb918c90d02', + 'scibert_basevocab_cased': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c'} _url_format = '{repo_url}gluon/dataset/vocab/{file_name}.zip' diff --git a/src/gluonnlp/model/bert.py b/src/gluonnlp/model/bert.py index 58c568322e..792b0ded8c 100644 --- a/src/gluonnlp/model/bert.py +++ b/src/gluonnlp/model/bert.py @@ -489,7 +489,11 @@ def _decode(self, sequence, masked_positions): ('885ebb9adc249a170c5576e90e88cfd1bbd98da6', 'bert_12_768_12_wiki_cn'), ('885ebb9adc249a170c5576e90e88cfd1bbd98da6', 'bert_12_768_12_wiki_cn_cased'), ('4e685a966f8bf07d533bd6b0e06c04136f23f620', 
'bert_24_1024_16_book_corpus_wiki_en_cased'), - ('24551e1446180e045019a87fc4ffbf714d99c0b5', 'bert_24_1024_16_book_corpus_wiki_en_uncased') + ('24551e1446180e045019a87fc4ffbf714d99c0b5', 'bert_24_1024_16_book_corpus_wiki_en_uncased'), + ('6c82d963fc8fa79c35dd6cb3e1725d1e5b6aa7d7', 'bert_12_768_12_scibert_scivocab_uncased'), + ('adf9c81e72ac286a37b9002da8df9e50a753d98b', 'bert_12_768_12_scibert_scivocab_cased'), + ('75acea8e8386890120533d6c0032b0b3fcb2d536', 'bert_12_768_12_scibert_basevocab_uncased'), + ('8e86e5de55d6dae99123312cd8cdd8183a75e057', 'bert_12_768_12_scibert_basevocab_cased'), ]}) bert_12_768_12_hparams = { @@ -541,10 +545,17 @@ def bert_12_768_12(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu(), Parameters ---------- dataset_name : str or None, default None - Options include 'book_corpus_wiki_en_cased', 'book_corpus_wiki_en_uncased', - 'wiki_cn_cased', 'wiki_multilingual_uncased' and 'wiki_multilingual_cased'. + If not None, the dataset name is used to load a vocabulary for the + dataset. If the `pretrained` argument is set to True, the dataset name + is further used to select the pretrained parameters to load. + The supported datasets are 'book_corpus_wiki_en_cased', + 'book_corpus_wiki_en_uncased', 'wiki_cn_cased', + 'wiki_multilingual_uncased', 'wiki_multilingual_cased', + 'scibert_scivocab_uncased', 'scibert_scivocab_cased', + 'scibert_basevocab_uncased','scibert_basevocab_cased'. vocab : gluonnlp.vocab.BERTVocab or None, default None - Vocabulary for the dataset. Must be provided if dataset is not specified. + Vocabulary for the dataset. Must be provided if dataset_name is not + specified. Ignored if dataset_name is specified. pretrained : bool, default True Whether to load the pretrained weights for model. ctx : Context, default CPU @@ -582,9 +593,13 @@ def bert_24_1024_16(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu() Parameters ---------- dataset_name : str or None, default None + If not None, the dataset name is used to load a vocabulary for the + dataset. If the `pretrained` argument is set to True, the dataset name + is further used to select the pretrained parameters to load. Options include 'book_corpus_wiki_en_uncased' and 'book_corpus_wiki_en_cased'. vocab : gluonnlp.vocab.BERTVocab or None, default None - Vocabulary for the dataset. Must be provided if dataset is not specified. + Vocabulary for the dataset. Must be provided if dataset_name is not + specified. Ignored if dataset_name is specified. pretrained : bool, default True Whether to load the pretrained weights for model. ctx : Context, default CPU @@ -624,12 +639,20 @@ def get_bert_model(model_name=None, dataset_name=None, vocab=None, model_name : str or None, default None Options include 'bert_24_1024_16' and 'bert_12_768_12'. dataset_name : str or None, default None - Options include 'book_corpus_wiki_en_cased', 'book_corpus_wiki_en_uncased' - for both bert_24_1024_16 and bert_12_768_12. - 'wiki_cn_cased', 'wiki_multilingual_uncased' and 'wiki_multilingual_cased' - for bert_12_768_12 only. + If not None, the dataset name is used to load a vocabulary for the + dataset. If the `pretrained` argument is set to True, the dataset name + is further used to select the pretrained parameters to load. + The supported datasets for model_name of either bert_24_1024_16 and + bert_12_768_12 are 'book_corpus_wiki_en_cased', + 'book_corpus_wiki_en_uncased'. 
+ For model_name bert_12_768_12 'wiki_cn_cased', + 'wiki_multilingual_uncased', 'wiki_multilingual_cased', + 'scibert_scivocab_uncased', 'scibert_scivocab_cased', + 'scibert_basevocab_uncased','scibert_basevocab_cased' are additionally + supported. vocab : gluonnlp.vocab.BERTVocab or None, default None - Vocabulary for the dataset. Must be provided if dataset is not specified. + Vocabulary for the dataset. Must be provided if dataset_name is not + specified. Ignored if dataset_name is specified. pretrained : bool, default True Whether to load the pretrained weights for model. ctx : Context, default CPU diff --git a/tests/unittest/test_models.py b/tests/unittest/test_models.py index 0b290cf96c..90edc87146 100644 --- a/tests/unittest/test_models.py +++ b/tests/unittest/test_models.py @@ -101,15 +101,23 @@ def test_transformer_models(): def test_pretrained_bert_models(): models = ['bert_12_768_12', 'bert_24_1024_16'] pretrained = { - 'bert_12_768_12': - ['book_corpus_wiki_en_cased', 'book_corpus_wiki_en_uncased', - 'wiki_multilingual_uncased', 'wiki_multilingual_cased', 'wiki_cn_cased'], - 'bert_24_1024_16': ['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased']} + 'bert_12_768_12': [ + 'book_corpus_wiki_en_cased', 'book_corpus_wiki_en_uncased', 'wiki_multilingual_uncased', + 'wiki_multilingual_cased', 'wiki_cn_cased', + 'scibert_scivocab_uncased', 'scibert_scivocab_cased', 'scibert_basevocab_uncased', + 'scibert_basevocab_cased' + ], + 'bert_24_1024_16': ['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased'] + } vocab_size = {'book_corpus_wiki_en_cased': 28996, 'book_corpus_wiki_en_uncased': 30522, 'wiki_multilingual_cased': 119547, 'wiki_cn_cased': 21128, - 'wiki_multilingual_uncased': 105879} + 'wiki_multilingual_uncased': 105879, + 'scibert_scivocab_uncased': 31090, + 'scibert_scivocab_cased': 31116, + 'scibert_basevocab_uncased': 30522, + 'scibert_basevocab_cased': 28996} special_tokens = ['[UNK]', '[PAD]', '[SEP]', '[CLS]', '[MASK]'] ones = mx.nd.ones((2, 10)) valid_length = mx.nd.ones((2,)) From 0d0fec74002ef041d755fcbf7bb33c99dc6079cb Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Fri, 7 Jun 2019 10:10:33 +0000 Subject: [PATCH 12/18] Add BioBert --- .../bert/conversion_tools/convert_tf_model.py | 42 +++++-- src/gluonnlp/data/utils.py | 6 +- src/gluonnlp/model/bert.py | 105 +++++++++++++----- src/gluonnlp/model/utils.py | 5 +- tests/unittest/test_models.py | 60 ++++++++-- 5 files changed, 169 insertions(+), 49 deletions(-) diff --git a/scripts/bert/conversion_tools/convert_tf_model.py b/scripts/bert/conversion_tools/convert_tf_model.py index bd0b540f9e..8eca89c5b6 100644 --- a/scripts/bert/conversion_tools/convert_tf_model.py +++ b/scripts/bert/conversion_tools/convert_tf_model.py @@ -47,6 +47,9 @@ parser.add_argument('--tf_checkpoint_dir', type=str, help='Path to Tensorflow checkpoint folder.') +parser.add_argument('--tf_model_prefix', type=str, + default='bert_model.ckpt', + help='name of bert checkpoint file.') parser.add_argument('--tf_config_name', type=str, default='bert_config.json', help='Name of Bert config file') @@ -75,7 +78,7 @@ # load tf model tf_checkpoint_file = os.path.expanduser( - os.path.join(args.tf_checkpoint_dir, 'bert_model.ckpt')) + os.path.join(args.tf_checkpoint_dir, args.tf_model_prefix)) logging.info('loading Tensorflow checkpoint %s ...', tf_checkpoint_file) tf_tensors = read_tf_checkpoint(tf_checkpoint_file) tf_names = sorted(tf_tensors.keys()) @@ -134,14 +137,6 @@ logging.info('warning: %s has symmetric shape %s', target_name, 
target.shape) logging.debug('%s: %s', target_name, target.shape) -# post processings for parameters: -# - handle tied decoder weight -mx_tensors['decoder.3.weight'] = mx_tensors['word_embed.0.weight'] -logging.info('total number of tf parameters = %d', len(tf_names)) -logging.info( - 'total number of mx parameters = %d (including decoder param for weight tying)', - len(mx_tensors)) - # BERT config tf_config_names_to_gluon_config_names = { 'attention_probs_dropout_prob': 'embed_dropout', @@ -176,6 +171,26 @@ dropout=predefined_args['dropout'], use_residual=predefined_args['use_residual']) +# Infer enabled BERTModel components +use_pooler = any('pooler' in n for n in mx_tensors) +use_decoder = any('decoder.0' in n for n in mx_tensors) +use_classifier = any('classifier.weight' in n for n in mx_tensors) + +logging.info('Inferred that the tensorflow model provides the following parameters:') +logging.info('- use_pooler = {}'.format(use_pooler)) +logging.info('- use_decoder = {}'.format(use_decoder)) +logging.info('- use_classifier = {}'.format(use_classifier)) + +# post processings for parameters: +# - handle tied decoder weight +logging.info('total number of tf parameters = %d', len(tf_names)) +if use_decoder: + mx_tensors['decoder.3.weight'] = mx_tensors['word_embed.0.weight'] + logging.info('total number of mx parameters = %d' + '(including decoder param for weight tying)', len(mx_tensors)) +else: + logging.info('total number of mx parameters = %d', len(mx_tensors)) + # BERT model bert = BERTModel(encoder, len(vocab), token_type_vocab_size=predefined_args['token_type_vocab_size'], @@ -183,14 +198,19 @@ embed_size=predefined_args['embed_size'], embed_dropout=predefined_args['embed_dropout'], word_embed=predefined_args['word_embed'], - use_pooler=True, use_decoder=True, - use_classifier=True) + use_pooler=use_pooler, use_decoder=use_decoder, + use_classifier=use_classifier) bert.initialize(init=mx.init.Normal(0.02)) ones = mx.nd.ones((2, 8)) out = bert(ones, ones, mx.nd.array([5, 6]), mx.nd.array([[1], [2]])) params = bert._collect_params_with_prefix() +if len(params) != len(mx_tensors): + raise RuntimeError('The Gluon BERTModel comprises {} parameter arrays, ' + 'but {} have been extracted from the tf model. 
' + 'Most likely the BERTModel hyperparameters do not match ' + 'the hyperparameters of the tf model.'.format(len(params), len(mx_tensors))) # set parameter data loaded_params = {} diff --git a/src/gluonnlp/data/utils.py b/src/gluonnlp/data/utils.py index a7649f9bb1..62334b0c9b 100644 --- a/src/gluonnlp/data/utils.py +++ b/src/gluonnlp/data/utils.py @@ -231,7 +231,11 @@ def _slice_pad_length(num_items, length, overlap=0): 'scibert_scivocab_uncased': '2d2566bfc416790ab2646ab0ada36ba628628d60', 'scibert_scivocab_cased': '2c714475b521ab8542cb65e46259f6bfeed8041b', 'scibert_basevocab_uncased': '80ef760a6bdafec68c99b691c94ebbb918c90d02', - 'scibert_basevocab_cased': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c'} + 'scibert_basevocab_cased': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c', + 'biobert_v1.0_pmc': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c', + 'biobert_v1.0_pubmed': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c', + 'biobert_v1.0_pubmed_pmc': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c', + 'biobert_v1.1_pubmed': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c'} _url_format = '{repo_url}gluon/dataset/vocab/{file_name}.zip' diff --git a/src/gluonnlp/model/bert.py b/src/gluonnlp/model/bert.py index 792b0ded8c..a503480f48 100644 --- a/src/gluonnlp/model/bert.py +++ b/src/gluonnlp/model/bert.py @@ -494,6 +494,10 @@ def _decode(self, sequence, masked_positions): ('adf9c81e72ac286a37b9002da8df9e50a753d98b', 'bert_12_768_12_scibert_scivocab_cased'), ('75acea8e8386890120533d6c0032b0b3fcb2d536', 'bert_12_768_12_scibert_basevocab_uncased'), ('8e86e5de55d6dae99123312cd8cdd8183a75e057', 'bert_12_768_12_scibert_basevocab_cased'), + ('a07780385add682f609772e81ec64aca77c9fb05', 'bert_12_768_12_biobert_v1.0_pmc'), + ('280ad1cc487db90489f86189e045e915b35e7489', 'bert_12_768_12_biobert_v1.0_pubmed'), + ('8a8c75441f028a6b928b11466f3d30f4360dfff5', 'bert_12_768_12_biobert_v1.0_pubmed_pmc'), + ('55f15c5d23829f6ee87622b68711b15fef50e55b', 'bert_12_768_12_biobert_v1.1_pubmed'), ]}) bert_12_768_12_hparams = { @@ -535,8 +539,8 @@ def _decode(self, sequence, masked_positions): def bert_12_768_12(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu(), - root=os.path.join(get_home_dir(), 'models'), use_pooler=True, - use_decoder=True, use_classifier=True, **kwargs): + root=os.path.join(get_home_dir(), 'models'), use_pooler=True, use_decoder=True, + use_classifier=True, pretrained_allow_missing=False, **kwargs): """Generic BERT BASE model. The number of layers (L) is 12, number of units (H) is 768, and the @@ -552,7 +556,9 @@ def bert_12_768_12(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu(), 'book_corpus_wiki_en_uncased', 'wiki_cn_cased', 'wiki_multilingual_uncased', 'wiki_multilingual_cased', 'scibert_scivocab_uncased', 'scibert_scivocab_cased', - 'scibert_basevocab_uncased','scibert_basevocab_cased'. + 'scibert_basevocab_uncased','scibert_basevocab_cased', + 'biobert_v1.0_pmc', 'biobert_v1.0_pubmed', 'biobert_v1.0_pubmed_pmc', + 'biobert_v1.1_pubmed' vocab : gluonnlp.vocab.BERTVocab or None, default None Vocabulary for the dataset. Must be provided if dataset_name is not specified. Ignored if dataset_name is specified. @@ -569,22 +575,41 @@ def bert_12_768_12(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu(), for for segment level classification task. use_decoder : bool, default True Whether to include the decoder for masked language model prediction. + Note that + 'biobert_v1.0_pmc', 'biobert_v1.0_pubmed', 'biobert_v1.0_pubmed_pmc', + 'biobert_v1.1_pubmed' + do not include these parameters. 
use_classifier : bool, default True Whether to include the classifier for next sentence classification. + Note that + 'biobert_v1.0_pmc', 'biobert_v1.0_pubmed', 'biobert_v1.0_pubmed_pmc', + 'biobert_v1.1_pubmed' + do not include these parameters. + pretrained_allow_missing : bool, default False + Whether to ignore if any parameters for the BERTModel are missing in + the pretrained weights for model. + Some BERTModels for example do not provide decoder or classifier + weights. In that case it is still possible to construct a BERTModel + with use_decoder=True and/or use_classifier=True, but the respective + parameters will be missing from the pretrained file. + If pretrained_allow_missing=True, this will be ignored and the + parameters will be left uninitialized. Otherwise AssertionError is + raised. Returns ------- BERTModel, gluonnlp.vocab.BERTVocab """ - return get_bert_model(model_name='bert_12_768_12', vocab=vocab, - dataset_name=dataset_name, pretrained=pretrained, ctx=ctx, - use_pooler=use_pooler, use_decoder=use_decoder, - use_classifier=use_classifier, root=root, **kwargs) + return get_bert_model(model_name='bert_12_768_12', vocab=vocab, dataset_name=dataset_name, + pretrained=pretrained, ctx=ctx, use_pooler=use_pooler, + use_decoder=use_decoder, use_classifier=use_classifier, root=root, + pretrained_allow_missing=pretrained_allow_missing, **kwargs) -def bert_24_1024_16(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu(), - use_pooler=True, use_decoder=True, use_classifier=True, - root=os.path.join(get_home_dir(), 'models'), **kwargs): +def bert_24_1024_16(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu(), use_pooler=True, + use_decoder=True, use_classifier=True, + root=os.path.join(get_home_dir(), 'models'), + pretrained_allow_missing=False, **kwargs): """Generic BERT LARGE model. The number of layers (L) is 24, number of units (H) is 1024, and the @@ -615,23 +640,31 @@ def bert_24_1024_16(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu() Whether to include the decoder for masked language model prediction. use_classifier : bool, default True Whether to include the classifier for next sentence classification. + pretrained_allow_missing : bool, default False + Whether to ignore if any parameters for the BERTModel are missing in + the pretrained weights for model. + Some BERTModels for example do not provide decoder or classifier + weights. In that case it is still possible to construct a BERTModel + with use_decoder=True and/or use_classifier=True, but the respective + parameters will be missing from the pretrained file. + If pretrained_allow_missing=True, this will be ignored and the + parameters will be left uninitialized. Otherwise AssertionError is + raised. 
Returns ------- BERTModel, gluonnlp.vocab.BERTVocab """ - return get_bert_model(model_name='bert_24_1024_16', vocab=vocab, - dataset_name=dataset_name, pretrained=pretrained, - ctx=ctx, use_pooler=use_pooler, - use_decoder=use_decoder, use_classifier=use_classifier, - root=root, **kwargs) - - -def get_bert_model(model_name=None, dataset_name=None, vocab=None, - pretrained=True, ctx=mx.cpu(), - use_pooler=True, use_decoder=True, use_classifier=True, - output_attention=False, output_all_encodings=False, - root=os.path.join(get_home_dir(), 'models'), **kwargs): + return get_bert_model(model_name='bert_24_1024_16', vocab=vocab, dataset_name=dataset_name, + pretrained=pretrained, ctx=ctx, use_pooler=use_pooler, + use_decoder=use_decoder, use_classifier=use_classifier, root=root, + pretrained_allow_missing=pretrained_allow_missing, **kwargs) + + +def get_bert_model(model_name=None, dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu(), + use_pooler=True, use_decoder=True, use_classifier=True, output_attention=False, + output_all_encodings=False, root=os.path.join(get_home_dir(), 'models'), + pretrained_allow_missing=False, **kwargs): """Any BERT pretrained model. Parameters @@ -648,8 +681,10 @@ def get_bert_model(model_name=None, dataset_name=None, vocab=None, For model_name bert_12_768_12 'wiki_cn_cased', 'wiki_multilingual_uncased', 'wiki_multilingual_cased', 'scibert_scivocab_uncased', 'scibert_scivocab_cased', - 'scibert_basevocab_uncased','scibert_basevocab_cased' are additionally - supported. + 'scibert_basevocab_uncased','scibert_basevocab_cased', + 'biobert_v1.0_pmc', 'biobert_v1.0_pubmed', 'biobert_v1.0_pubmed_pmc', + 'biobert_v1.1_pubmed' + are additionally supported. vocab : gluonnlp.vocab.BERTVocab or None, default None Vocabulary for the dataset. Must be provided if dataset_name is not specified. Ignored if dataset_name is specified. @@ -666,12 +701,30 @@ def get_bert_model(model_name=None, dataset_name=None, vocab=None, for for segment level classification task. use_decoder : bool, default True Whether to include the decoder for masked language model prediction. + Note that + 'biobert_v1.0_pmc', 'biobert_v1.0_pubmed', 'biobert_v1.0_pubmed_pmc', + 'biobert_v1.1_pubmed' + do not include these parameters. use_classifier : bool, default True Whether to include the classifier for next sentence classification. + Note that + 'biobert_v1.0_pmc', 'biobert_v1.0_pubmed', 'biobert_v1.0_pubmed_pmc', + 'biobert_v1.1_pubmed' + do not include these parameters. output_attention : bool, default False Whether to include attention weights of each encoding cell to the output. output_all_encodings : bool, default False Whether to output encodings of all encoder cells. + pretrained_allow_missing : bool, default False + Whether to ignore if any parameters for the BERTModel are missing in + the pretrained weights for model. + Some BERTModels for example do not provide decoder or classifier + weights. In that case it is still possible to construct a BERTModel + with use_decoder=True and/or use_classifier=True, but the respective + parameters will be missing from the pretrained file. + If pretrained_allow_missing=True, this will be ignored and the + parameters will be left uninitialized. Otherwise AssertionError is + raised. 
Returns ------- @@ -712,6 +765,6 @@ def get_bert_model(model_name=None, dataset_name=None, vocab=None, use_classifier=use_classifier) if pretrained: ignore_extra = not (use_pooler and use_decoder and use_classifier) - _load_pretrained_params(net, model_name, dataset_name, root, ctx, - ignore_extra=ignore_extra) + _load_pretrained_params(net, model_name, dataset_name, root, ctx, ignore_extra=ignore_extra, + allow_missing=pretrained_allow_missing) return net, bert_vocab diff --git a/src/gluonnlp/model/utils.py b/src/gluonnlp/model/utils.py index b6b4359cf6..3b4b0c7739 100644 --- a/src/gluonnlp/model/utils.py +++ b/src/gluonnlp/model/utils.py @@ -274,7 +274,8 @@ def _load_vocab(dataset_name, vocab, root, cls=None): return vocab -def _load_pretrained_params(net, model_name, dataset_name, root, ctx, ignore_extra=False): +def _load_pretrained_params(net, model_name, dataset_name, root, ctx, ignore_extra=False, + allow_missing=False): path = '_'.join([model_name, dataset_name]) model_file = model_store.get_model_file(path, root=root) - net.load_parameters(model_file, ctx=ctx, ignore_extra=ignore_extra) + net.load_parameters(model_file, ctx=ctx, ignore_extra=ignore_extra, allow_missing=allow_missing) diff --git a/tests/unittest/test_models.py b/tests/unittest/test_models.py index 90edc87146..2da1fa0cae 100644 --- a/tests/unittest/test_models.py +++ b/tests/unittest/test_models.py @@ -98,14 +98,16 @@ def test_transformer_models(): @pytest.mark.serial @pytest.mark.remote_required -def test_pretrained_bert_models(): +@pytest.mark.parametrize('disable_missing_parameters', [False, True]) +def test_pretrained_bert_models(disable_missing_parameters): models = ['bert_12_768_12', 'bert_24_1024_16'] pretrained = { 'bert_12_768_12': [ 'book_corpus_wiki_en_cased', 'book_corpus_wiki_en_uncased', 'wiki_multilingual_uncased', 'wiki_multilingual_cased', 'wiki_cn_cased', 'scibert_scivocab_uncased', 'scibert_scivocab_cased', 'scibert_basevocab_uncased', - 'scibert_basevocab_cased' + 'scibert_basevocab_cased', 'biobert_v1.0_pmc', 'biobert_v1.0_pubmed', + 'biobert_v1.0_pubmed_pmc', 'biobert_v1.1_pubmed' ], 'bert_24_1024_16': ['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased'] } @@ -117,18 +119,51 @@ def test_pretrained_bert_models(): 'scibert_scivocab_uncased': 31090, 'scibert_scivocab_cased': 31116, 'scibert_basevocab_uncased': 30522, - 'scibert_basevocab_cased': 28996} + 'scibert_basevocab_cased': 28996, + 'biobert_v1.0_pubmed': 28996, + 'biobert_v1.0_pmc': 28996, + 'biobert_v1.0_pubmed_pmc': 28996, + 'biobert_v1.1_pubmed': 28996} special_tokens = ['[UNK]', '[PAD]', '[SEP]', '[CLS]', '[MASK]'] ones = mx.nd.ones((2, 10)) valid_length = mx.nd.ones((2,)) positions = mx.nd.zeros((2, 3)) for model_name in models: - eprint('testing forward for %s' % model_name) pretrained_datasets = pretrained.get(model_name) for dataset in pretrained_datasets: - model, vocab = nlp.model.get_model(model_name, dataset_name=dataset, - pretrained=True, - root='tests/data/model/') + has_missing_params = 'biobert' in dataset + if not has_missing_params and disable_missing_parameters: + # No parameters to disable for models pretrained on this dataset + continue + + eprint('testing forward for %s on %s' % (model_name, dataset)) + + if not has_missing_params: + model, vocab = nlp.model.get_model(model_name, dataset_name=dataset, + pretrained=True, + root='tests/data/model/') + else: + with pytest.raises(AssertionError): + model, vocab = nlp.model.get_model(model_name, dataset_name=dataset, + pretrained=True, + root='tests/data/model/') 
+ + if not disable_missing_parameters: + model, vocab = nlp.model.get_model(model_name, dataset_name=dataset, + pretrained=True, + root='tests/data/model/', + pretrained_allow_missing=True) + else: + # Biobert specific test case; needs to be adapted in case + # of other datasets with missing parameters + assert 'biobert' in dataset + model, vocab = nlp.model.get_model(model_name, dataset_name=dataset, + pretrained=True, + root='tests/data/model/', + pretrained_allow_missing=True, + use_decoder=False, + use_classifier=False) + assert len(vocab) == vocab_size[dataset] for token in special_tokens: assert token in vocab, "Token %s not found in the vocab" % token @@ -137,8 +172,14 @@ def test_pretrained_bert_models(): assert vocab.unknown_token == '[UNK]' assert vocab.bos_token is None assert vocab.eos_token is None - output = model(ones, ones, valid_length, positions) - output[0].wait_to_read() + + if has_missing_params and not disable_missing_parameters: + with pytest.raises(RuntimeError): + output = model(ones, ones, valid_length, positions) + output[0].wait_to_read() + else: + output = model(ones, ones, valid_length, positions) + output[0].wait_to_read() del model mx.nd.waitall() @@ -490,6 +531,7 @@ def forward(self, inpt): for name, param in shared_net.collect_params().items(): assert not mx.test_utils.almost_equal(grads[name].asnumpy(), param.grad().asnumpy()) + def test_gelu(): x = mx.random.uniform(shape=(3, 4, 5)) net = nlp.model.GELU() From d724106ddff3225437260f0d21043f98587524a7 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Fri, 7 Jun 2019 14:30:33 +0000 Subject: [PATCH 13/18] Add PyTorch Bert support Confirmed that the scibert_scivocab_uncased loaded from PyTorch produces the same output as the tensorflow version (based on the compare_tf_gluon_model.py). --- .../conversion_tools/convert_pytorch_model.py | 182 ++++++++++++++++++ ...er_pytorch_gluon_parameter_name_mapping.py | 96 +++++++++ src/gluonnlp/data/utils.py | 3 +- src/gluonnlp/model/bert.py | 13 +- tests/unittest/test_models.py | 5 +- 5 files changed, 292 insertions(+), 7 deletions(-) create mode 100644 scripts/bert/conversion_tools/convert_pytorch_model.py create mode 100644 scripts/bert/conversion_tools/infer_pytorch_gluon_parameter_name_mapping.py diff --git a/scripts/bert/conversion_tools/convert_pytorch_model.py b/scripts/bert/conversion_tools/convert_pytorch_model.py new file mode 100644 index 0000000000..df9bcddb04 --- /dev/null +++ b/scripts/bert/conversion_tools/convert_pytorch_model.py @@ -0,0 +1,182 @@ +# coding: utf-8 + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# 'License'); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint:disable=redefined-outer-name,logging-format-interpolation +""" Script for converting PyTorch Model to Gluon. 
""" + +import argparse +import json +import logging +import os +import sys + +import mxnet as mx +import gluonnlp as nlp +import torch +from gluonnlp.model import BERTEncoder, BERTModel +from gluonnlp.model.bert import bert_hparams + +sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))) +from utils import get_hash, load_tf_vocab, tf_vocab_to_gluon_vocab + +parser = argparse.ArgumentParser(description='Conversion script for PyTorch BERT model', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--model', type=str, default='bert_12_768_12', + choices=['bert_12_768_12', 'bert_24_1024_16'], help='BERT model name') +parser.add_argument('--pytorch_checkpoint_dir', type=str, + help='Path to Tensorflow checkpoint folder.') +parser.add_argument('--vocab_file', type=str, help='Full path to the vocab.txt') +parser.add_argument('--gluon_pytorch_name_mapping', type=str, + default='gluon_to_pytorch_naming.json', + help='Output of infer_pytorch_gluon_parameter_name_mapping.py') +parser.add_argument('--out_dir', type=str, default=os.path.join('~', 'output'), + help='Path to output folder. The folder must exist.') +parser.add_argument('--debug', action='store_true', help='debugging mode') +args = parser.parse_args() +logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO) +logging.info(args) + +# convert vocabulary +vocab = tf_vocab_to_gluon_vocab(load_tf_vocab(args.vocab_file)) + +# vocab serialization +tmp_file_path = os.path.expanduser(os.path.join(args.out_dir, 'tmp')) +with open(tmp_file_path, 'w') as f: + f.write(vocab.to_json()) +hash_full, hash_short = get_hash(tmp_file_path) +gluon_vocab_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.vocab')) +with open(gluon_vocab_path, 'w') as f: + f.write(vocab.to_json()) + logging.info('vocab file saved to %s. 
hash = %s', gluon_vocab_path, hash_full) + +# Load PyTorch Model +pytorch_parameters = torch.load(os.path.join(args.pytorch_checkpoint_dir, 'pytorch_model.bin'), + map_location=lambda storage, loc: storage) +pytorch_parameters = {k: v.numpy() for k, v in pytorch_parameters.items()} + +# Make sure vocab fits to model +assert pytorch_parameters['bert.embeddings.word_embeddings.weight'].shape[0] == len( + vocab.idx_to_token) + +# Load Mapping +with open(args.gluon_pytorch_name_mapping, 'r') as f: + mapping = json.load(f) + +# BERT config +tf_config_names_to_gluon_config_names = { + 'attention_probs_dropout_prob': 'embed_dropout', + 'hidden_act': None, + 'hidden_dropout_prob': 'dropout', + 'hidden_size': 'units', + 'initializer_range': None, + 'intermediate_size': 'hidden_size', + 'max_position_embeddings': 'max_length', + 'num_attention_heads': 'num_heads', + 'num_hidden_layers': 'num_layers', + 'type_vocab_size': 'token_type_vocab_size', + 'vocab_size': None +} +predefined_args = bert_hparams[args.model] +with open(os.path.join(args.pytorch_checkpoint_dir, 'bert_config.json'), 'r') as f: + tf_config = json.load(f) + assert len(tf_config) == len(tf_config_names_to_gluon_config_names) + for tf_name, gluon_name in tf_config_names_to_gluon_config_names.items(): + if tf_name is None or gluon_name is None: + continue + assert tf_config[tf_name] == predefined_args[gluon_name] + +# BERT encoder +encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'], + num_layers=predefined_args['num_layers'], units=predefined_args['units'], + hidden_size=predefined_args['hidden_size'], + max_length=predefined_args['max_length'], + num_heads=predefined_args['num_heads'], scaled=predefined_args['scaled'], + dropout=predefined_args['dropout'], + use_residual=predefined_args['use_residual']) + +# Infer enabled BERTModel components +use_pooler = any('pooler' in n for n in pytorch_parameters) +use_decoder = any('cls.predictions.transform.dense.weight' in n for n in pytorch_parameters) +use_classifier = any('cls.seq_relationship.weight' in n for n in pytorch_parameters) + +if not use_classifier and 'classifier.weight' in pytorch_parameters and \ + pytorch_parameters['classifier.weight'].shape[0] == 2: + logging.info('Assuming classifier weights in provided Pytorch model are ' + 'from next sentence prediction task.') + use_classifier = True + +logging.info('Inferred that the pytorch model provides the following parameters:') +logging.info('- use_pooler = {}'.format(use_pooler)) +logging.info('- use_decoder = {}'.format(use_decoder)) +logging.info('- use_classifier = {}'.format(use_classifier)) + +# BERT model +bert = BERTModel(encoder, len(vocab), + token_type_vocab_size=predefined_args['token_type_vocab_size'], + units=predefined_args['units'], embed_size=predefined_args['embed_size'], + embed_dropout=predefined_args['embed_dropout'], + word_embed=predefined_args['word_embed'], use_pooler=use_pooler, + use_decoder=use_decoder, use_classifier=use_classifier) + +bert.initialize(init=mx.init.Normal(0.02)) + +ones = mx.nd.ones((2, 8)) +out = bert(ones, ones, mx.nd.array([5, 6]), mx.nd.array([[1], [2]])) +params = bert._collect_params_with_prefix() +assert len(params) == len(pytorch_parameters), "Gluon model does not match PyTorch model. " \ + "Please fix the BERTModel hyperparameters" + +# set parameter data +loaded_params = {} +for name in params: + if name not in mapping: + raise RuntimeError('Invalid json mapping file. 
' + 'The parameter {} is not described in the mapping file.'.format(name)) + pytorch_name = mapping[name] + if pytorch_name not in pytorch_parameters.keys(): + # Handle inconsistent naming in PyTorch + # The Expected names here are based on the PyTorch version of SciBert. + # The Inconsistencies were found in ClinicalBert + if 'LayerNorm' in pytorch_name: + pytorch_name = pytorch_name.replace('weight', 'gamma') + pytorch_name = pytorch_name.replace('bias', 'beta') + assert pytorch_name in pytorch_parameters.keys() + + if 'cls.seq_relationship' in pytorch_name: + pytorch_name = pytorch_name.replace('cls.seq_relationship', 'classifier') + + arr = mx.nd.array(pytorch_parameters[pytorch_name]) + + if arr.shape != params[name].shape: + breakpoint() + + params[name].set_data(arr) + loaded_params[name] = True + +if len(params) != len(loaded_params): + raise RuntimeError('The Gluon BERTModel comprises {} parameter arrays, ' + 'but {} have been extracted from the pytorch model. '.format( + len(params), len(loaded_params))) + +# param serialization +bert.save_parameters(tmp_file_path) +hash_full, hash_short = get_hash(tmp_file_path) +gluon_param_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.params')) +logging.info('param saved to %s. hash = %s', gluon_param_path, hash_full) +bert.save_parameters(gluon_param_path) +mx.nd.waitall() diff --git a/scripts/bert/conversion_tools/infer_pytorch_gluon_parameter_name_mapping.py b/scripts/bert/conversion_tools/infer_pytorch_gluon_parameter_name_mapping.py new file mode 100644 index 0000000000..ce9d11d7a0 --- /dev/null +++ b/scripts/bert/conversion_tools/infer_pytorch_gluon_parameter_name_mapping.py @@ -0,0 +1,96 @@ +# coding: utf-8 + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# 'License'); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint:disable=redefined-outer-name,logging-format-interpolation +"""PyTorch BERT parameter naming to Gluon BERT parameter naming. + +Given a Gluon BERT model (eg. obtained with the convert_tf_gluon.py script) and +a pytorch_model.bin containing the same parameters, this script infers the +naming convention of PyTorch. 
+
+"""
+
+import argparse
+import json
+import logging
+import os
+import sys
+
+import gluonnlp as nlp
+import torch
+
+sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)))
+from utils import load_tf_vocab, tf_vocab_to_gluon_vocab
+
+parser = argparse.ArgumentParser(description='Pytorch BERT Naming Convention',
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--model', type=str, default='bert_12_768_12',
+                    choices=['bert_12_768_12', 'bert_24_1024_16'], help='BERT model name')
+parser.add_argument('--dataset_name', type=str, default='scibert_scivocab_uncased',
+                    help='Dataset name')
+parser.add_argument('--pytorch_checkpoint_dir', type=str,
+                    help='Path to PyTorch checkpoint folder.')
+parser.add_argument('--debug', action='store_true', help='debugging mode')
+parser.add_argument('--out', default='gluon_to_pytorch_naming.json',
+                    help='Output file to store gluon to pytorch name mapping.')
+args = parser.parse_args()
+logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)
+logging.info(args)
+
+# Load Gluon Model
+# TODO root is only set until parameters are uploaded to S3
+bert, vocab = nlp.model.get_model(args.model, dataset_name=args.dataset_name, pretrained=True,
+                                  root='/home/ubuntu/projects/gluon-nlp/tests/data/model/')
+parameters = bert._collect_params_with_prefix()
+parameters = {k: v.data().asnumpy() for k, v in parameters.items()}
+
+# Load PyTorch Model
+pytorch_parameters = torch.load(os.path.join(args.pytorch_checkpoint_dir, 'pytorch_model.bin'),
+                                map_location=lambda storage, loc: storage)
+pytorch_vocab = tf_vocab_to_gluon_vocab(
+    load_tf_vocab(os.path.join(args.pytorch_checkpoint_dir, 'vocab.txt')))
+pytorch_parameters = {k: v.numpy() for k, v in pytorch_parameters.items()}
+
+# Assert that vocabularies are equal
+assert pytorch_vocab.idx_to_token == vocab.idx_to_token
+
+mapping = dict()
+
+for name, param in parameters.items():
+    found_match = False
+    for pytorch_name, pytorch_param in pytorch_parameters.items():
+        if param.shape == pytorch_param.shape:
+            if (param == pytorch_param).all():
+                if found_match:
+                    print('Found multiple matches for {}. '
+                          'Ignoring new match {}'.format(name, pytorch_name))
+                else:
+                    found_match = True
+                    mapping.update({name: pytorch_name})
+
+    # We don't break here, in case there are multiple matches
+
+    if not found_match:
+        raise RuntimeError('Pytorch and Gluon model do not match. 
' + 'Cannot infer mapping of names.') + +assert len(mapping) == len(parameters) + +with open(args.out, 'w') as f: + json.dump(mapping, f, indent=" ") + print('Wrote mapping to {}'.format(args.out)) diff --git a/src/gluonnlp/data/utils.py b/src/gluonnlp/data/utils.py index 62334b0c9b..c2753a1c67 100644 --- a/src/gluonnlp/data/utils.py +++ b/src/gluonnlp/data/utils.py @@ -235,7 +235,8 @@ def _slice_pad_length(num_items, length, overlap=0): 'biobert_v1.0_pmc': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c', 'biobert_v1.0_pubmed': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c', 'biobert_v1.0_pubmed_pmc': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c', - 'biobert_v1.1_pubmed': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c'} + 'biobert_v1.1_pubmed': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c', + 'clinicalbert': '80ef760a6bdafec68c99b691c94ebbb918c90d02'} _url_format = '{repo_url}gluon/dataset/vocab/{file_name}.zip' diff --git a/src/gluonnlp/model/bert.py b/src/gluonnlp/model/bert.py index a503480f48..faafad8569 100644 --- a/src/gluonnlp/model/bert.py +++ b/src/gluonnlp/model/bert.py @@ -498,6 +498,7 @@ def _decode(self, sequence, masked_positions): ('280ad1cc487db90489f86189e045e915b35e7489', 'bert_12_768_12_biobert_v1.0_pubmed'), ('8a8c75441f028a6b928b11466f3d30f4360dfff5', 'bert_12_768_12_biobert_v1.0_pubmed_pmc'), ('55f15c5d23829f6ee87622b68711b15fef50e55b', 'bert_12_768_12_biobert_v1.1_pubmed'), + ('60281c98ba3572dfdaac75131fa96e2136d70d5c', 'bert_12_768_12_clinicalbert'), ]}) bert_12_768_12_hparams = { @@ -558,7 +559,8 @@ def bert_12_768_12(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu(), 'scibert_scivocab_uncased', 'scibert_scivocab_cased', 'scibert_basevocab_uncased','scibert_basevocab_cased', 'biobert_v1.0_pmc', 'biobert_v1.0_pubmed', 'biobert_v1.0_pubmed_pmc', - 'biobert_v1.1_pubmed' + 'biobert_v1.1_pubmed', + 'clinicalbert' vocab : gluonnlp.vocab.BERTVocab or None, default None Vocabulary for the dataset. Must be provided if dataset_name is not specified. Ignored if dataset_name is specified. @@ -577,7 +579,8 @@ def bert_12_768_12(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu(), Whether to include the decoder for masked language model prediction. Note that 'biobert_v1.0_pmc', 'biobert_v1.0_pubmed', 'biobert_v1.0_pubmed_pmc', - 'biobert_v1.1_pubmed' + 'biobert_v1.1_pubmed', + 'clinicalbert' do not include these parameters. use_classifier : bool, default True Whether to include the classifier for next sentence classification. @@ -683,7 +686,8 @@ def get_bert_model(model_name=None, dataset_name=None, vocab=None, pretrained=Tr 'scibert_scivocab_uncased', 'scibert_scivocab_cased', 'scibert_basevocab_uncased','scibert_basevocab_cased', 'biobert_v1.0_pmc', 'biobert_v1.0_pubmed', 'biobert_v1.0_pubmed_pmc', - 'biobert_v1.1_pubmed' + 'biobert_v1.1_pubmed', + 'clinicalbert' are additionally supported. vocab : gluonnlp.vocab.BERTVocab or None, default None Vocabulary for the dataset. Must be provided if dataset_name is not @@ -703,7 +707,8 @@ def get_bert_model(model_name=None, dataset_name=None, vocab=None, pretrained=Tr Whether to include the decoder for masked language model prediction. Note that 'biobert_v1.0_pmc', 'biobert_v1.0_pubmed', 'biobert_v1.0_pubmed_pmc', - 'biobert_v1.1_pubmed' + 'biobert_v1.1_pubmed', + 'clinicalbert' do not include these parameters. use_classifier : bool, default True Whether to include the classifier for next sentence classification. 
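The docstring additions above record that, like the BioBERT checkpoints, the converted ClinicalBERT checkpoint ships without the masked-language-model decoder parameters. A minimal usage sketch of loading it through the public API follows; it mirrors the unit-test change made later in this series and uses the dataset name as registered at this point ('clinicalbert', renamed in a later patch).

.. code-block:: python

    # Sketch only: load the ClinicalBERT checkpoint registered above.
    # The decoder weights are not part of the checkpoint, hence
    # use_decoder=False and pretrained_allow_missing=True, as in the unit test.
    import mxnet as mx
    import gluonnlp as nlp

    model, vocab = nlp.model.get_model('bert_12_768_12',
                                       dataset_name='clinicalbert',
                                       pretrained=True,
                                       use_decoder=False,
                                       pretrained_allow_missing=True)

    # Dummy forward pass with token ids, token types and valid lengths,
    # analogous to the check in tests/unittest/test_models.py.
    ones = mx.nd.ones((2, 10))
    valid_length = mx.nd.ones((2,))
    outputs = model(ones, ones, valid_length)
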
diff --git a/tests/unittest/test_models.py b/tests/unittest/test_models.py index 2da1fa0cae..39bc98469a 100644 --- a/tests/unittest/test_models.py +++ b/tests/unittest/test_models.py @@ -107,7 +107,7 @@ def test_pretrained_bert_models(disable_missing_parameters): 'wiki_multilingual_cased', 'wiki_cn_cased', 'scibert_scivocab_uncased', 'scibert_scivocab_cased', 'scibert_basevocab_uncased', 'scibert_basevocab_cased', 'biobert_v1.0_pmc', 'biobert_v1.0_pubmed', - 'biobert_v1.0_pubmed_pmc', 'biobert_v1.1_pubmed' + 'biobert_v1.0_pubmed_pmc', 'biobert_v1.1_pubmed', 'clinicalbert' ], 'bert_24_1024_16': ['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased'] } @@ -123,7 +123,8 @@ def test_pretrained_bert_models(disable_missing_parameters): 'biobert_v1.0_pubmed': 28996, 'biobert_v1.0_pmc': 28996, 'biobert_v1.0_pubmed_pmc': 28996, - 'biobert_v1.1_pubmed': 28996} + 'biobert_v1.1_pubmed': 28996, + 'clinicalbert': 30522} special_tokens = ['[UNK]', '[PAD]', '[SEP]', '[CLS]', '[MASK]'] ones = mx.nd.ones((2, 10)) valid_length = mx.nd.ones((2,)) From 5614e5b0fe386f08e307eec0fb47314061df345c Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Fri, 7 Jun 2019 19:46:47 +0000 Subject: [PATCH 14/18] Fix lint --- scripts/bert/conversion_tools/convert_pytorch_model.py | 4 +--- src/gluonnlp/model/bert.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/bert/conversion_tools/convert_pytorch_model.py b/scripts/bert/conversion_tools/convert_pytorch_model.py index df9bcddb04..6bfbfd866a 100644 --- a/scripts/bert/conversion_tools/convert_pytorch_model.py +++ b/scripts/bert/conversion_tools/convert_pytorch_model.py @@ -162,9 +162,7 @@ arr = mx.nd.array(pytorch_parameters[pytorch_name]) - if arr.shape != params[name].shape: - breakpoint() - + assert arr.shape == params[name].shape params[name].set_data(arr) loaded_params[name] = True diff --git a/src/gluonnlp/model/bert.py b/src/gluonnlp/model/bert.py index faafad8569..21dbb603b0 100644 --- a/src/gluonnlp/model/bert.py +++ b/src/gluonnlp/model/bert.py @@ -688,7 +688,7 @@ def get_bert_model(model_name=None, dataset_name=None, vocab=None, pretrained=Tr 'biobert_v1.0_pmc', 'biobert_v1.0_pubmed', 'biobert_v1.0_pubmed_pmc', 'biobert_v1.1_pubmed', 'clinicalbert' - are additionally supported. + are additionally supported. vocab : gluonnlp.vocab.BERTVocab or None, default None Vocabulary for the dataset. Must be provided if dataset_name is not specified. Ignored if dataset_name is specified. 
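The lint fix above replaces the interactive breakpoint in convert_pytorch_model.py with a hard shape assertion. Matching shapes alone cannot show that every tensor landed in the right slot, so a numerical cross-check of the converted parameters against the original pytorch_model.bin is a useful follow-up. The sketch below reuses the name mapping and the gamma/beta and seq_relationship fallbacks from the conversion script; the checkpoint paths are hypothetical placeholders, not files produced by this series.

.. code-block:: python

    # Sketch only: compare each converted Gluon parameter with its PyTorch
    # counterpart through gluon_to_pytorch_naming.json. Paths are placeholders.
    import json
    import os

    import numpy as np
    import torch
    import gluonnlp as nlp

    checkpoint_dir = os.path.expanduser('~/clinicalbert')                # hypothetical
    gluon_param_file = os.path.expanduser('~/output/converted.params')   # hypothetical
    mapping_file = 'gluon_to_pytorch_naming.json'

    pytorch_parameters = torch.load(os.path.join(checkpoint_dir, 'pytorch_model.bin'),
                                    map_location=lambda storage, loc: storage)
    pytorch_parameters = {k: v.numpy() for k, v in pytorch_parameters.items()}

    with open(mapping_file, 'r') as f:
        mapping = json.load(f)

    bert, _ = nlp.model.get_model('bert_12_768_12', dataset_name='clinicalbert',
                                  pretrained=False, use_decoder=False)
    bert.load_parameters(gluon_param_file, ignore_extra=True)

    for gluon_name, param in bert._collect_params_with_prefix().items():
        pytorch_name = mapping[gluon_name]
        if pytorch_name not in pytorch_parameters:
            # Same naming fallbacks as in convert_pytorch_model.py.
            if 'LayerNorm' in pytorch_name:
                pytorch_name = pytorch_name.replace('weight', 'gamma').replace('bias', 'beta')
            if 'cls.seq_relationship' in pytorch_name:
                pytorch_name = pytorch_name.replace('cls.seq_relationship', 'classifier')
        np.testing.assert_allclose(param.data().asnumpy(), pytorch_parameters[pytorch_name],
                                   rtol=1e-5, atol=1e-6)
    print('All mapped parameters match the PyTorch checkpoint.')
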
From cb8344b0a853357579d9706b4eab39e960d147c5 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Fri, 7 Jun 2019 20:40:50 +0000 Subject: [PATCH 15/18] Fix clinicalbert test case --- tests/unittest/test_models.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/unittest/test_models.py b/tests/unittest/test_models.py index 39bc98469a..949805d0cf 100644 --- a/tests/unittest/test_models.py +++ b/tests/unittest/test_models.py @@ -132,7 +132,7 @@ def test_pretrained_bert_models(disable_missing_parameters): for model_name in models: pretrained_datasets = pretrained.get(model_name) for dataset in pretrained_datasets: - has_missing_params = 'biobert' in dataset + has_missing_params = any(n in dataset for n in ('biobert', 'clinicalbert')) if not has_missing_params and disable_missing_parameters: # No parameters to disable for models pretrained on this dataset continue @@ -154,16 +154,23 @@ def test_pretrained_bert_models(disable_missing_parameters): pretrained=True, root='tests/data/model/', pretrained_allow_missing=True) - else: - # Biobert specific test case; needs to be adapted in case - # of other datasets with missing parameters - assert 'biobert' in dataset + elif 'biobert' in dataset: + # Biobert specific test case model, vocab = nlp.model.get_model(model_name, dataset_name=dataset, pretrained=True, root='tests/data/model/', pretrained_allow_missing=True, use_decoder=False, use_classifier=False) + elif 'clinicalbert' in dataset: + # Clinicalbert specific test case + model, vocab = nlp.model.get_model(model_name, dataset_name=dataset, + pretrained=True, + root='tests/data/model/', + pretrained_allow_missing=True, + use_decoder=False) + else: + assert False, "Testcase needs to be adapted." assert len(vocab) == vocab_size[dataset] for token in special_tokens: From ff53a5738c4845028c653850dcc394b143aa0d59 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Sat, 8 Jun 2019 08:16:17 +0000 Subject: [PATCH 16/18] Address comments --- scripts/bert/conversion_tools/convert_pytorch_model.py | 4 ++-- scripts/bert/conversion_tools/convert_tf_model.py | 4 ++-- .../infer_pytorch_gluon_parameter_name_mapping.py | 8 +++----- scripts/bert/utils.py | 4 ++-- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/scripts/bert/conversion_tools/convert_pytorch_model.py b/scripts/bert/conversion_tools/convert_pytorch_model.py index 6bfbfd866a..978e377e98 100644 --- a/scripts/bert/conversion_tools/convert_pytorch_model.py +++ b/scripts/bert/conversion_tools/convert_pytorch_model.py @@ -32,7 +32,7 @@ from gluonnlp.model.bert import bert_hparams sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))) -from utils import get_hash, load_tf_vocab, tf_vocab_to_gluon_vocab +from utils import get_hash, load_text_vocab, tf_vocab_to_gluon_vocab parser = argparse.ArgumentParser(description='Conversion script for PyTorch BERT model', formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -52,7 +52,7 @@ logging.info(args) # convert vocabulary -vocab = tf_vocab_to_gluon_vocab(load_tf_vocab(args.vocab_file)) +vocab = tf_vocab_to_gluon_vocab(load_text_vocab(args.vocab_file)) # vocab serialization tmp_file_path = os.path.expanduser(os.path.join(args.out_dir, 'tmp')) diff --git a/scripts/bert/conversion_tools/convert_tf_model.py b/scripts/bert/conversion_tools/convert_tf_model.py index 8eca89c5b6..6dd0806486 100644 --- a/scripts/bert/conversion_tools/convert_tf_model.py +++ b/scripts/bert/conversion_tools/convert_tf_model.py @@ -32,7 +32,7 @@ 
sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))) -from utils import (get_hash, load_tf_vocab, read_tf_checkpoint, +from utils import (get_hash, load_text_vocab, read_tf_checkpoint, tf_vocab_to_gluon_vocab) @@ -64,7 +64,7 @@ # convert vocabulary vocab_path = os.path.join(args.tf_checkpoint_dir, 'vocab.txt') -vocab = tf_vocab_to_gluon_vocab(load_tf_vocab(vocab_path)) +vocab = tf_vocab_to_gluon_vocab(load_text_vocab(vocab_path)) # vocab serialization tmp_file_path = os.path.expanduser(os.path.join(args.out_dir, 'tmp')) diff --git a/scripts/bert/conversion_tools/infer_pytorch_gluon_parameter_name_mapping.py b/scripts/bert/conversion_tools/infer_pytorch_gluon_parameter_name_mapping.py index ce9d11d7a0..7797805735 100644 --- a/scripts/bert/conversion_tools/infer_pytorch_gluon_parameter_name_mapping.py +++ b/scripts/bert/conversion_tools/infer_pytorch_gluon_parameter_name_mapping.py @@ -35,7 +35,7 @@ import torch sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))) -from utils import load_tf_vocab, tf_vocab_to_gluon_vocab +from utils import load_text_vocab, tf_vocab_to_gluon_vocab parser = argparse.ArgumentParser(description='Pytorch BERT Naming Convention', formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -53,9 +53,7 @@ logging.info(args) # Load Gluon Model -# TODO root is only set until parameters are uploaded to S3 -bert, vocab = nlp.model.get_model(args.model, dataset_name=args.dataset_name, pretrained=True, - root='/home/ubuntu/projects/gluon-nlp/tests/data/model/') +bert, vocab = nlp.model.get_model(args.model, dataset_name=args.dataset_name, pretrained=True) parameters = bert._collect_params_with_prefix() parameters = {k: v.data().asnumpy() for k, v in parameters.items()} @@ -63,7 +61,7 @@ pytorch_parameters = torch.load(os.path.join(args.pytorch_checkpoint_dir, 'pytorch_model.bin'), map_location=lambda storage, loc: storage) pytorch_vocab = tf_vocab_to_gluon_vocab( - load_tf_vocab(os.path.join(args.pytorch_checkpoint_dir, 'vocab.txt'))) + load_text_vocab(os.path.join(args.pytorch_checkpoint_dir, 'vocab.txt'))) pytorch_parameters = {k: v.numpy() for k, v in pytorch_parameters.items()} # Assert that vocabularies are equal diff --git a/scripts/bert/utils.py b/scripts/bert/utils.py index 62c8ea3730..fe1d9abfa1 100644 --- a/scripts/bert/utils.py +++ b/scripts/bert/utils.py @@ -26,7 +26,7 @@ import mxnet as mx import gluonnlp as nlp -__all__ = ['tf_vocab_to_gluon_vocab', 'load_tf_vocab'] +__all__ = ['tf_vocab_to_gluon_vocab', 'load_text_vocab'] def tf_vocab_to_gluon_vocab(tf_vocab): @@ -76,7 +76,7 @@ def profile(curr_step, start_step, end_step, profile_name='profile.json', if early_exit: exit() -def load_tf_vocab(vocab_file): +def load_text_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = collections.OrderedDict() index = 0 From 16222f440d6b2b06b83b4735f243987d712fcb25 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Sat, 8 Jun 2019 19:46:12 +0000 Subject: [PATCH 17/18] Rename biobert and clinicalbert models --- src/gluonnlp/data/utils.py | 10 +++++----- src/gluonnlp/model/bert.py | 10 +++++----- tests/unittest/test_models.py | 18 +++++++++--------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/gluonnlp/data/utils.py b/src/gluonnlp/data/utils.py index c2753a1c67..24369be9aa 100644 --- a/src/gluonnlp/data/utils.py +++ b/src/gluonnlp/data/utils.py @@ -232,11 +232,11 @@ def _slice_pad_length(num_items, length, overlap=0): 'scibert_scivocab_cased': 
'2c714475b521ab8542cb65e46259f6bfeed8041b', 'scibert_basevocab_uncased': '80ef760a6bdafec68c99b691c94ebbb918c90d02', 'scibert_basevocab_cased': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c', - 'biobert_v1.0_pmc': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c', - 'biobert_v1.0_pubmed': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c', - 'biobert_v1.0_pubmed_pmc': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c', - 'biobert_v1.1_pubmed': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c', - 'clinicalbert': '80ef760a6bdafec68c99b691c94ebbb918c90d02'} + 'biobert_v1.0_pmc_cased': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c', + 'biobert_v1.0_pubmed_cased': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c', + 'biobert_v1.0_pubmed_pmc_cased': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c', + 'biobert_v1.1_pubmed_cased': 'a4ff6fe1f85ba95f3010742b9abc3a818976bb2c', + 'clinicalbert_uncased': '80ef760a6bdafec68c99b691c94ebbb918c90d02'} _url_format = '{repo_url}gluon/dataset/vocab/{file_name}.zip' diff --git a/src/gluonnlp/model/bert.py b/src/gluonnlp/model/bert.py index 21dbb603b0..582a32e4cf 100644 --- a/src/gluonnlp/model/bert.py +++ b/src/gluonnlp/model/bert.py @@ -494,11 +494,11 @@ def _decode(self, sequence, masked_positions): ('adf9c81e72ac286a37b9002da8df9e50a753d98b', 'bert_12_768_12_scibert_scivocab_cased'), ('75acea8e8386890120533d6c0032b0b3fcb2d536', 'bert_12_768_12_scibert_basevocab_uncased'), ('8e86e5de55d6dae99123312cd8cdd8183a75e057', 'bert_12_768_12_scibert_basevocab_cased'), - ('a07780385add682f609772e81ec64aca77c9fb05', 'bert_12_768_12_biobert_v1.0_pmc'), - ('280ad1cc487db90489f86189e045e915b35e7489', 'bert_12_768_12_biobert_v1.0_pubmed'), - ('8a8c75441f028a6b928b11466f3d30f4360dfff5', 'bert_12_768_12_biobert_v1.0_pubmed_pmc'), - ('55f15c5d23829f6ee87622b68711b15fef50e55b', 'bert_12_768_12_biobert_v1.1_pubmed'), - ('60281c98ba3572dfdaac75131fa96e2136d70d5c', 'bert_12_768_12_clinicalbert'), + ('a07780385add682f609772e81ec64aca77c9fb05', 'bert_12_768_12_biobert_v1.0_pmc_cased'), + ('280ad1cc487db90489f86189e045e915b35e7489', 'bert_12_768_12_biobert_v1.0_pubmed_cased'), + ('8a8c75441f028a6b928b11466f3d30f4360dfff5', 'bert_12_768_12_biobert_v1.0_pubmed_pmc_cased'), + ('55f15c5d23829f6ee87622b68711b15fef50e55b', 'bert_12_768_12_biobert_v1.1_pubmed_cased'), + ('60281c98ba3572dfdaac75131fa96e2136d70d5c', 'bert_12_768_12_clinicalbert_uncased'), ]}) bert_12_768_12_hparams = { diff --git a/tests/unittest/test_models.py b/tests/unittest/test_models.py index 949805d0cf..3f9e842d92 100644 --- a/tests/unittest/test_models.py +++ b/tests/unittest/test_models.py @@ -104,10 +104,10 @@ def test_pretrained_bert_models(disable_missing_parameters): pretrained = { 'bert_12_768_12': [ 'book_corpus_wiki_en_cased', 'book_corpus_wiki_en_uncased', 'wiki_multilingual_uncased', - 'wiki_multilingual_cased', 'wiki_cn_cased', - 'scibert_scivocab_uncased', 'scibert_scivocab_cased', 'scibert_basevocab_uncased', - 'scibert_basevocab_cased', 'biobert_v1.0_pmc', 'biobert_v1.0_pubmed', - 'biobert_v1.0_pubmed_pmc', 'biobert_v1.1_pubmed', 'clinicalbert' + 'wiki_multilingual_cased', 'wiki_cn_cased', 'scibert_scivocab_uncased', + 'scibert_scivocab_cased', 'scibert_basevocab_uncased', 'scibert_basevocab_cased', + 'biobert_v1.0_pmc_cased', 'biobert_v1.0_pubmed_cased', 'biobert_v1.0_pubmed_pmc_cased', + 'biobert_v1.1_pubmed_cased', 'clinicalbert_uncased' ], 'bert_24_1024_16': ['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased'] } @@ -120,11 +120,11 @@ def test_pretrained_bert_models(disable_missing_parameters): 'scibert_scivocab_cased': 31116, 
'scibert_basevocab_uncased': 30522, 'scibert_basevocab_cased': 28996, - 'biobert_v1.0_pubmed': 28996, - 'biobert_v1.0_pmc': 28996, - 'biobert_v1.0_pubmed_pmc': 28996, - 'biobert_v1.1_pubmed': 28996, - 'clinicalbert': 30522} + 'biobert_v1.0_pubmed_cased': 28996, + 'biobert_v1.0_pmc_cased': 28996, + 'biobert_v1.0_pubmed_pmc_cased': 28996, + 'biobert_v1.1_pubmed_cased': 28996, + 'clinicalbert_uncased': 30522} special_tokens = ['[UNK]', '[PAD]', '[SEP]', '[CLS]', '[MASK]'] ones = mx.nd.ones((2, 10)) valid_length = mx.nd.ones((2,)) From 277ceba656f5f80a255dbe3c960c763ed75d1ec1 Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Sat, 8 Jun 2019 12:53:51 -0700 Subject: [PATCH 18/18] Update bert.py --- src/gluonnlp/model/bert.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gluonnlp/model/bert.py b/src/gluonnlp/model/bert.py index 582a32e4cf..742a8d190b 100644 --- a/src/gluonnlp/model/bert.py +++ b/src/gluonnlp/model/bert.py @@ -496,7 +496,8 @@ def _decode(self, sequence, masked_positions): ('8e86e5de55d6dae99123312cd8cdd8183a75e057', 'bert_12_768_12_scibert_basevocab_cased'), ('a07780385add682f609772e81ec64aca77c9fb05', 'bert_12_768_12_biobert_v1.0_pmc_cased'), ('280ad1cc487db90489f86189e045e915b35e7489', 'bert_12_768_12_biobert_v1.0_pubmed_cased'), - ('8a8c75441f028a6b928b11466f3d30f4360dfff5', 'bert_12_768_12_biobert_v1.0_pubmed_pmc_cased'), + ('8a8c75441f028a6b928b11466f3d30f4360dfff5', + 'bert_12_768_12_biobert_v1.0_pubmed_pmc_cased'), ('55f15c5d23829f6ee87622b68711b15fef50e55b', 'bert_12_768_12_biobert_v1.1_pubmed_cased'), ('60281c98ba3572dfdaac75131fa96e2136d70d5c', 'bert_12_768_12_clinicalbert_uncased'), ]})
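With the final naming in place ('biobert_v1.0_*_cased', 'biobert_v1.1_pubmed_cased', 'clinicalbert_uncased'), the converted checkpoints behave like any other pretrained BERT weights in GluonNLP. A closing sketch of feature extraction with the renamed ClinicalBERT model; the loading flags follow the unit test above, and the input sentence is an arbitrary example.

.. code-block:: python

    # Sketch only: extract contextual embeddings with 'clinicalbert_uncased'.
    import mxnet as mx
    import gluonnlp as nlp

    model, vocab = nlp.model.get_model('bert_12_768_12',
                                       dataset_name='clinicalbert_uncased',
                                       pretrained=True,
                                       use_decoder=False,
                                       pretrained_allow_missing=True)

    tokenizer = nlp.data.BERTTokenizer(vocab, lower=True)
    tokens = ['[CLS]'] + tokenizer('the patient was discharged home') + ['[SEP]']

    token_ids = mx.nd.array([vocab[tokens]])
    token_types = mx.nd.zeros_like(token_ids)
    valid_length = mx.nd.array([len(tokens)])

    # The first output is the sequence of contextual embeddings; further
    # outputs (pooler, next-sentence classifier) depend on the enabled heads.
    outputs = model(token_ids, token_types, valid_length)
    seq_encoding = outputs[0] if isinstance(outputs, (list, tuple)) else outputs
    print(seq_encoding.shape)  # (1, number of tokens, 768)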