diff --git a/.pytype.cfg b/.pytype.cfg
new file mode 100644
index 0000000000..8220a41658
--- /dev/null
+++ b/.pytype.cfg
@@ -0,0 +1,8 @@
+# NOTE: All relative paths are relative to the location of this file.
+[pytype]
+# Space-separated list of files or directories to process.
+inputs =
+    src/gluonnlp
+
+# Python version (major.minor) of the target code.
+python_version = 3.5
diff --git a/Makefile b/Makefile
index c370c4ca7d..90b1b01e19 100644
--- a/Makefile
+++ b/Makefile
@@ -24,12 +24,16 @@ flake8:
 pylint:
 	pylint --rcfile=$(ROOTDIR)/.pylintrc $(lintdir)
 
+pytype:
+	pytype --config=$(ROOTDIR)/.pytype.cfg
+
 restruc:
 	python setup.py check --restructuredtext --strict
 
 lint:
 	make lintdir=$(lintdir) flake8
 	make lintdir=$(lintdir) pylint
+	make pytype
 	make lintdir=$(lintdir) ratcheck
 	make restruc
diff --git a/env/cpu/py3-master.yml b/env/cpu/py3-master.yml
index e51e77234d..5621e266bb 100644
--- a/env/cpu/py3-master.yml
+++ b/env/cpu/py3-master.yml
@@ -6,6 +6,8 @@ dependencies:
   - perl
   - pip:
     - cython
+    - boto3
+    - pytype==2019.10.17
     - pytest==5.2.3
     - pytest-env==0.6.2
     - pytest-cov==2.8.1
diff --git a/env/docker/py3.yml b/env/docker/py3.yml
index 11a638464c..7bea6f010f 100644
--- a/env/docker/py3.yml
+++ b/env/docker/py3.yml
@@ -21,6 +21,7 @@ dependencies:
   - jieba
   - scikit-learn==0.21.3
   - cython
+  - pytype==2019.10.17
   - pytest==5.2.3
   - pytest-env==0.6.2
   - pytest-cov==2.8.1
diff --git a/env/gpu/py3-master.yml b/env/gpu/py3-master.yml
index 665e1bb432..11c85a831c 100644
--- a/env/gpu/py3-master.yml
+++ b/env/gpu/py3-master.yml
@@ -20,6 +20,8 @@ dependencies:
   - seaborn
   - jieba
   - cython
+  - boto3
+  - pytype==2019.10.17
   - pytest==5.2.3
   - pytest-env==0.6.2
   - pytest-cov==2.8.1
diff --git a/src/gluonnlp/base.py b/src/gluonnlp/base.py
index 0054cf816e..ea2e9835e4 100644
--- a/src/gluonnlp/base.py
+++ b/src/gluonnlp/base.py
@@ -23,7 +23,7 @@
 __all__ = ['numba_njit', 'numba_prange', 'numba_jitclass', 'numba_types', 'get_home_dir']
 
 try:
-    from numba import njit, prange, jitclass, types
+    from numba import njit, prange, jitclass, types  # pytype: disable=import-error
     numba_njit = njit(nogil=True)
     numba_prange = prange
     numba_jitclass = jitclass
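Note: the `# pytype: disable=import-error` comment makes pytype skip the unresolved optional import instead of failing the whole check. Below is a minimal sketch of the optional-numba shim that `base.py` centralizes; the `try` branch is taken from the diff, while the fallback branch is an assumption modelled on the shim this PR removes from `batchify/embedding.py`.

```python
# Sketch of an optional-dependency shim, as centralized in gluonnlp/base.py.
# The try-branch mirrors the diff above; the except-branch is a hedged
# reconstruction based on the code removed from batchify/embedding.py.
try:
    from numba import njit, prange, jitclass, types  # pytype: disable=import-error
    numba_njit = njit(nogil=True)
    numba_prange = prange
    numba_jitclass = jitclass
    numba_types = types
except ImportError:
    # numba is optional: fall back to no-op shims so callers still run, just slower.
    numba_prange = range

    def numba_njit(func):
        return func
```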
diff --git a/src/gluonnlp/data/__init__.py b/src/gluonnlp/data/__init__.py
index 79ea84b9de..d96863105d 100644
--- a/src/gluonnlp/data/__init__.py
+++ b/src/gluonnlp/data/__init__.py
@@ -39,9 +39,10 @@
 from .word_embedding_evaluation import *
 from .intent_slot import *
 
+
 __all__ = (['batchify'] + utils.__all__ + transforms.__all__ + sampler.__all__
            + dataset.__all__ + corpora.__all__ + sentiment.__all__
            + word_embedding_evaluation.__all__ + stream.__all__ + conll.__all__
            + translation.__all__ + registry.__all__ + question_answering.__all__
            + dataloader.__all__ + candidate_sampler.__all__ + intent_slot.__all__
-           + glue.__all__)
+           + glue.__all__)  # pytype: disable=attribute-error
diff --git a/src/gluonnlp/data/batchify/__init__.py b/src/gluonnlp/data/batchify/__init__.py
index 633fbbb381..7c7e1b6f71 100644
--- a/src/gluonnlp/data/batchify/__init__.py
+++ b/src/gluonnlp/data/batchify/__init__.py
@@ -18,7 +18,7 @@
 # pylint: disable=wildcard-import
 """Batchify helpers."""
 
-from . import batchify, language_model
+from . import batchify, embedding, language_model
 from .batchify import *
 from .embedding import *
 from .language_model import *
diff --git a/src/gluonnlp/data/batchify/batchify.py b/src/gluonnlp/data/batchify/batchify.py
index 8801583489..2e800b5892 100644
--- a/src/gluonnlp/data/batchify/batchify.py
+++ b/src/gluonnlp/data/batchify/batchify.py
@@ -20,9 +20,8 @@
 import warnings
 import math
 
-from typing import Dict as t_Dict, Callable as t_Callable,\
-    NamedTuple as t_NamedTuple, List as t_List, Tuple as t_Tuple, AnyStr,\
-    Union as t_Union
+from typing import (Dict as t_Dict, Callable as t_Callable, List as t_List, Tuple as t_Tuple,
+                    AnyStr, Union as t_Union)
 
 import numpy as np
 import mxnet as mx
@@ -461,8 +460,8 @@ class NamedTuple:
 
     Parameters
     ----------
-    container
-        The object that constructs the namedtuple.
+    container : NamedTuple class
+        The object that constructs the NamedTuple.
     fn_info
         The information of the inner batchify functions.
 
@@ -501,11 +500,8 @@ class NamedTuple:
     [0 1 0]
     )
     """
-    def __init__(self,
-                 container: t_NamedTuple,
-                 fn_info: t_Union[t_List[t_Callable],
-                                  t_Tuple[t_Callable],
-                                  t_Dict[AnyStr, t_Callable]]):
+    def __init__(self, container, fn_info: t_Union[t_List[t_Callable], t_Tuple[t_Callable],
+                                                   t_Dict[AnyStr, t_Callable]]):
         self._container = container
         if isinstance(fn_info, (list, tuple)):
             if len(container._fields) != len(fn_info):
@@ -526,17 +522,17 @@ def __init__(self,
                     raise ValueError('All batchify functions must be callable.')
             self._fn_l = fn_info
 
-    def __call__(self, data: t_List[t_NamedTuple]) -> t_NamedTuple:
+    def __call__(self, data):
         """Batchify the input data.
 
         Parameters
         ----------
-        data
-            The samples to batchfy. Each sample should be a namedtuple.
+        data : List of NamedTuple
+            The samples to batchify. Each sample should be a NamedTuple.
 
         Returns
         -------
-        ret
+        ret : List of NamedTuple
             A namedtuple of length N. Contains the batchified result of each attribute in the input.
         """
         if not isinstance(data[0], self._container):
diff --git a/src/gluonnlp/data/batchify/embedding.py b/src/gluonnlp/data/batchify/embedding.py
index bce48271be..7f859b3116 100644
--- a/src/gluonnlp/data/batchify/embedding.py
+++ b/src/gluonnlp/data/batchify/embedding.py
@@ -24,18 +24,9 @@
 
 import numpy as np
 
+from ...base import numba_njit, numba_prange
 from ..stream import DataStream
 
-try:
-    from numba import njit, prange
-    numba_njit = njit(nogil=True)
-except ImportError:
-    # Define numba shims
-    prange = range
-
-    def numba_njit(func):
-        return func
-
 
 class EmbeddingCenterContextBatchify:
     """Helper to create batches of center and contexts words.
@@ -127,7 +118,7 @@ def __init__(self, sentences, batch_size, window_size,
         self._index_dtype = index_dtype
 
     def __iter__(self):
-        if prange is range:
+        if numba_prange is range:
             logging.warning(
                 'EmbeddingCenterContextBatchify supports just in time compilation '
                 'with numba, but numba is not installed. '
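Note: the `NamedTuple` batchify helper keeps working as before; only its annotations are relaxed so pytype can analyse them. A hedged usage sketch follows (the `TrainExample` container and its field names are made up for illustration):

```python
from collections import namedtuple
import gluonnlp.data.batchify as bf

# Hypothetical container; any namedtuple class works.
TrainExample = namedtuple('TrainExample', ['data', 'label'])

# One inner batchify function per field, given as a dict (a list/tuple also works).
batchify_fn = bf.NamedTuple(TrainExample, {'data': bf.Pad(), 'label': bf.Stack()})

samples = [TrainExample(data=[1, 2, 3, 4], label=0),
           TrainExample(data=[5, 7], label=1)]
batch = batchify_fn(samples)  # -> TrainExample(data=<padded NDArray>, label=<stacked NDArray>)
```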
diff --git a/src/gluonnlp/data/conll.py b/src/gluonnlp/data/conll.py
index 95a4aa53f6..b2602e944e 100644
--- a/src/gluonnlp/data/conll.py
+++ b/src/gluonnlp/data/conll.py
@@ -79,7 +79,8 @@ def _read_data(self):
         results = []
         for path in paths:
             with gzip.open(path, 'r') if path.endswith('gz') else io.open(path, 'rb') as f:
-                line_iter = codecs.getreader(self.codec)(io.BufferedReader(f))
+                line_iter = codecs.getreader(self.codec)\
+                    (io.BufferedReader(f))  # pytype: disable=wrong-arg-types
                 results.append(self._process_iter(line_iter))
         return list([x for field in item for x in field] for item in zip(*results))
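Note: splitting the `codecs.getreader(...)(...)` call only sidesteps a pytype `wrong-arg-types` complaint; the runtime behaviour is unchanged. A sketch of the same read pattern, with a hypothetical file name:

```python
import codecs
import gzip
import io

# Read a gzipped file line by line through an explicit codec, mirroring _read_data above.
with gzip.open('sample.conll.gz', 'rb') as f:
    line_iter = codecs.getreader('utf-8')(io.BufferedReader(f))
    for line in line_iter:
        fields = line.rstrip('\n').split('\t')
```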
diff --git a/src/gluonnlp/data/corpora/google_billion_word.py b/src/gluonnlp/data/corpora/google_billion_word.py
index 81d9cfe350..36128a4dbc 100644
--- a/src/gluonnlp/data/corpora/google_billion_word.py
+++ b/src/gluonnlp/data/corpora/google_billion_word.py
@@ -29,41 +29,74 @@
 
 from mxnet.gluon.utils import _get_repo_file_url, check_sha1, download
 
-from ... import _constants as C
+from ..._constants import EOS_TOKEN
+from ...base import get_home_dir
 from ...vocab import Vocab
-from ..stream import SimpleDatasetStream
 from ..dataset import CorpusDataset
-from ...base import get_home_dir
+from ..stream import SimpleDatasetStream
+
+
+class GBWStream(SimpleDatasetStream):
+    """1-Billion-Word word-level dataset for language modeling, from Google.
+
+    The GBWStream iterates over CorpusDatasets(flatten=False).
+
+    Source http://www.statmt.org/lm-benchmark
+
+    License: Apache
+
+    Parameters
+    ----------
+    segment : {'train', 'test'}, default 'train'
+        Dataset segment.
+    skip_empty : bool, default True
+        Whether to skip the empty samples produced from sample_splitters. If False, `bos` and `eos`
+        will be added in empty samples.
+    bos : str or None, default None
+        The token to add at the beginning of each sentence. If None, nothing is added.
+    eos : str or None, default '<eos>'
+        The token to add at the end of each sentence. If None, nothing is added.
+    root : str, default '$MXNET_HOME/datasets/gbw'
+        Path to temp folder for storing data.
+        MXNET_HOME defaults to '~/.mxnet'.
+    """
 
-class _GBWStream(SimpleDatasetStream):
-    def __init__(self, namespace, segment, bos, eos, skip_empty, root):
-        """Directory layout:
-        - root ($MXNET_HOME/datasets/gbw)
-          - archive_file (1-billion-word-language-modeling-benchmark-r13output.tar.gz)
-          - dir (1-billion-word-language-modeling-benchmark-r13output)
-            - subdir (training-monolingual.tokenized.shuffled)
-            - subdir (heldout-monolingual.tokenized.shuffled)
-        """
+    _archive_data = ('1-billion-word-language-modeling-benchmark-r13output.tar.gz',
+                     '4df859766482e12264a5a9d9fb7f0e276020447d')
+    _archive_vocab = ('gbw-ebb1a287.zip',
+                      '63b335dcc27b6804d0a14acb88332d2602fe0f59')
+    _data_file = {'train': ('training-monolingual.tokenized.shuffled',
+                            'news.en-00*-of-00100',
+                            '5e0d7050b37a99fd50ce7e07dc52468b2a9cd9e8'),
+                  'test': ('heldout-monolingual.tokenized.shuffled',
+                           'news.en.heldout-00000-of-00050',
+                           '0a8e2b7496ba0b5c05158f282b9b351356875445')}
+    _vocab_file = ('gbw-ebb1a287.vocab',
+                   'ebb1a287ca14d8fa6f167c3a779e5e7ed63ac69f')
+
+    # Directory layout:
+    # - root ($MXNET_HOME/datasets/gbw)
+    #   - archive_file (1-billion-word-language-modeling-benchmark-r13output.tar.gz)
+    #   - dir (1-billion-word-language-modeling-benchmark-r13output)
+    #     - subdir (training-monolingual.tokenized.shuffled)
+    #     - subdir (heldout-monolingual.tokenized.shuffled)
+
+    def __init__(self, segment='train', skip_empty=True, bos=None, eos=EOS_TOKEN,
+                 root=os.path.join(get_home_dir(), 'datasets', 'gbw')):
         root = os.path.expanduser(root)
         if not os.path.isdir(root):
             os.makedirs(root)
         self._root = root
         self._dir = os.path.join(root, '1-billion-word-language-modeling-benchmark-r13output')
-        self._namespace = 'gluon/dataset/{}'.format(namespace)
+        self._namespace = 'gluon/dataset/gbw'
         subdir_name, pattern, data_hash = self._data_file[segment]
         self._subdir = os.path.join(self._dir, subdir_name)
         self._file_pattern = os.path.join(self._subdir, pattern)
         self._data_hash = data_hash
         self._get_data()
         sampler = 'sequential' if segment != 'train' else 'random'
-        super(_GBWStream, self).__init__(
-            dataset=CorpusDataset,
-            file_pattern=self._file_pattern,
-            skip_empty=skip_empty,
-            bos=bos,
-            eos=eos,
-            file_sampler=sampler)
+        super().__init__(dataset=CorpusDataset, file_pattern=self._file_pattern,
+                         skip_empty=skip_empty, bos=bos, eos=eos, file_sampler=sampler)
 
     def _get_data(self):
         archive_file_name, archive_hash = self._archive_data
@@ -106,46 +139,6 @@ def _get_vocab(self):
                     zf.extractall(path=root)
         return path
 
-class GBWStream(_GBWStream):
-    """1-Billion-Word word-level dataset for language modeling, from Google.
-
-    The GBWSream iterates over CorpusDatasets(flatten=False).
-
-    Source http://www.statmt.org/lm-benchmark
-
-    License: Apache
-
-    Parameters
-    ----------
-    segment : {'train', 'test'}, default 'train'
-        Dataset segment.
-    skip_empty : bool, default True
-        Whether to skip the empty samples produced from sample_splitters. If False, `bos` and `eos`
-        will be added in empty samples.
-    bos : str or None, default None
-        The token to add at the begining of each sentence. If None, nothing is added.
-    eos : str or None, default '<eos>'
-        The token to add at the end of each sentence. If None, nothing is added.
-    root : str, default '$MXNET_HOME/datasets/gbw'
-        Path to temp folder for storing data.
-        MXNET_HOME defaults to '~/.mxnet'.
-    """
-    def __init__(self, segment='train', skip_empty=True, bos=None, eos=C.EOS_TOKEN,
-                 root=os.path.join(get_home_dir(), 'datasets', 'gbw')):
-        self._archive_data = ('1-billion-word-language-modeling-benchmark-r13output.tar.gz',
-                              '4df859766482e12264a5a9d9fb7f0e276020447d')
-        self._archive_vocab = ('gbw-ebb1a287.zip',
-                               '63b335dcc27b6804d0a14acb88332d2602fe0f59')
-        self._data_file = {'train': ('training-monolingual.tokenized.shuffled',
-                                     'news.en-00*-of-00100',
-                                     '5e0d7050b37a99fd50ce7e07dc52468b2a9cd9e8'),
-                           'test': ('heldout-monolingual.tokenized.shuffled',
-                                    'news.en.heldout-00000-of-00050',
-                                    '0a8e2b7496ba0b5c05158f282b9b351356875445')}
-        self._vocab_file = ('gbw-ebb1a287.vocab',
-                            'ebb1a287ca14d8fa6f167c3a779e5e7ed63ac69f')
-        super(GBWStream, self).__init__('gbw', segment, bos, eos, skip_empty, root)
-
     @property
     def vocab(self):
         path = self._get_vocab()
diff --git a/src/gluonnlp/data/stream.py b/src/gluonnlp/data/stream.py
index 62c65ce24a..6f5d2d4f38 100644
--- a/src/gluonnlp/data/stream.py
+++ b/src/gluonnlp/data/stream.py
@@ -25,6 +25,7 @@
 import multiprocessing
 import multiprocessing.pool
 import os
+import queue
 import random
 import sys
 import threading
@@ -33,12 +34,7 @@
 
 import numpy as np
 import mxnet as mx
-from mxnet.gluon.data import RandomSampler, SequentialSampler, Sampler
-
-try:
-    import Queue as queue
-except ImportError:
-    import queue
+from mxnet.gluon.data import RandomSampler, Sampler, SequentialSampler
 
 __all__ = [
     'DataStream', 'SimpleDataStream', 'DatasetStream', 'SimpleDatasetStream',
diff --git a/src/gluonnlp/embedding/__init__.py b/src/gluonnlp/embedding/__init__.py
index 70411f6b5e..3ca8826cf1 100644
--- a/src/gluonnlp/embedding/__init__.py
+++ b/src/gluonnlp/embedding/__init__.py
@@ -18,8 +18,7 @@
 # pylint: disable=wildcard-import
 """Word embeddings."""
 
+from . import evaluation, token_embedding
 from .token_embedding import *
-from . import evaluation
-
 __all__ = (token_embedding.__all__ + ['evaluation'])
diff --git a/src/gluonnlp/embedding/token_embedding.py b/src/gluonnlp/embedding/token_embedding.py
index 12f2cb396b..9c3eb8ba9b 100644
--- a/src/gluonnlp/embedding/token_embedding.py
+++ b/src/gluonnlp/embedding/token_embedding.py
@@ -16,7 +16,6 @@
 # under the License.
 
 # pylint: disable=consider-iterating-dictionary, too-many-lines
-
 """Text token embedding."""
 
 __all__ = [
@@ -214,16 +213,21 @@ def __init__(self, unknown_token=C.UNK_TOKEN, init_unknown_vec=INIT_UNKNOWN_VEC,
             if idx_to_vec.shape[0] != len(idx_to_token):
                 raise ValueError('idx_to_token and idx_to_vec must contain '
                                  'the same number of tokens and embeddings respectively.')
-            if init_unknown_vec is not None:
-                logging.info('Ignoring init_unknown_vec as idx_to_vec is specified')
             if unknown_token is not None:
                 try:
                     unknown_index = idx_to_token.index(unknown_token)
+                    if init_unknown_vec is not None:
+                        logging.info('Ignoring init_unknown_vec as idx_to_vec is specified')
                 except ValueError:
-                    idx_to_token.insert(0, unknown_token)
-                    idx_to_vec = nd.concat(init_unknown_vec((1, idx_to_vec.shape[1])), idx_to_vec,
-                                           dim=0)
-                    unknown_index = 0
+                    if init_unknown_vec is not None:
+                        idx_to_token.insert(0, unknown_token)
+                        idx_to_vec = nd.concat(init_unknown_vec((1, idx_to_vec.shape[1])),
+                                               idx_to_vec, dim=0)
+                        unknown_index = 0
+                    else:
+                        raise ValueError('unknown_token "{}" is not part of idx_to_vec but '
+                                         'init_unknown_vec is None. '
+                                         'You must provide either of them.'.format(unknown_token))
 
             # Initialization
             self._unknown_token = unknown_token
@@ -350,7 +354,7 @@ def _load_embedding_txt(pretrained_file_path, elem_delim, unknown_token,
         with io.open(pretrained_file_path, 'rb') as f:
             for line_num, line in enumerate(f):
                 try:
-                    line = line.decode(encoding)
+                    line = line.decode(encoding)  # pytype: disable=attribute-error
                 except ValueError:
                     warnings.warn('line {} in {}: failed to decode. Skipping.'
                                   .format(line_num, pretrained_file_path))
@@ -1286,7 +1290,7 @@ def _load_w2v_binary(cls, pretrained_file_path, unknown_token,
         loaded_unknown_vec = None
         pretrained_file_path = os.path.expanduser(pretrained_file_path)
         with io.open(pretrained_file_path, 'rb') as f:
-            header = f.readline().decode(encoding=encoding)
+            header = f.readline().decode(encoding=encoding)  # pytype: disable=attribute-error
             vocab_size, vec_len = (int(x) for x in header.split())
             if unknown_token:
                 # Reserve a vector slot for the unknown token at the very beggining
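Note: besides the annotation work, the `TokenEmbedding.__init__` change above adds one behavioural guard: if `unknown_token` is missing from `idx_to_vec` and `init_unknown_vec` is `None`, a `ValueError` is now raised up front. A hedged sketch of both cases, assuming `TokenEmbedding` is constructed directly from `idx_to_token`/`idx_to_vec` as the code above does:

```python
import mxnet as mx
from gluonnlp.embedding import TokenEmbedding

# '<unk>' is absent from idx_to_token, but init_unknown_vec supplies a vector for it,
# so a slot is prepended at index 0 (the pre-existing behaviour).
emb = TokenEmbedding(idx_to_token=['hello', 'world'], idx_to_vec=mx.nd.ones((2, 5)),
                     unknown_token='<unk>', init_unknown_vec=mx.nd.zeros)

# With init_unknown_vec=None the same call now raises the new ValueError, since
# there is no way to obtain a vector for the unknown token.
try:
    TokenEmbedding(idx_to_token=['hello', 'world'], idx_to_vec=mx.nd.ones((2, 5)),
                   unknown_token='<unk>', init_unknown_vec=None)
except ValueError as e:
    print(e)
```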
diff --git a/src/gluonnlp/initializer/__init__.py b/src/gluonnlp/initializer/__init__.py
index 438bd087fd..8279515af8 100644
--- a/src/gluonnlp/initializer/__init__.py
+++ b/src/gluonnlp/initializer/__init__.py
@@ -18,6 +18,8 @@
 # pylint: disable=wildcard-import
 """NLP initializer."""
 
+from . import initializer
+
 from .initializer import *
 
 __all__ = initializer.__all__
diff --git a/src/gluonnlp/loss/__init__.py b/src/gluonnlp/loss/__init__.py
index 02dc553547..c87346c288 100644
--- a/src/gluonnlp/loss/__init__.py
+++ b/src/gluonnlp/loss/__init__.py
@@ -18,6 +18,8 @@
 # pylint: disable=wildcard-import
 """NLP loss."""
 
+from . import activation_regularizer, loss, label_smoothing
+
 from .activation_regularizer import *
 from .loss import *
 from .label_smoothing import *
diff --git a/src/gluonnlp/metric/__init__.py b/src/gluonnlp/metric/__init__.py
index eb753f0046..cdf34e27ff 100644
--- a/src/gluonnlp/metric/__init__.py
+++ b/src/gluonnlp/metric/__init__.py
@@ -18,6 +18,8 @@
 # pylint: disable=wildcard-import
 """NLP Metrics."""
 
+from . import masked_accuracy
+
 from .masked_accuracy import *
 
 __all__ = masked_accuracy.__all__
diff --git a/src/gluonnlp/model/sequence_sampler.py b/src/gluonnlp/model/sequence_sampler.py
index 550d0efa7b..60c034f83b 100644
--- a/src/gluonnlp/model/sequence_sampler.py
+++ b/src/gluonnlp/model/sequence_sampler.py
@@ -18,11 +18,18 @@
 
 __all__ = ['BeamSearchScorer', 'BeamSearchSampler', 'HybridBeamSearchSampler', 'SequenceSampler']
 
+from typing import TypeVar
+
 import numpy as np
+
 import mxnet as mx
 from mxnet.gluon import HybridBlock
+
 from .._constants import LARGE_NEGATIVE_FLOAT
 
+__T = TypeVar('__T')
+
+
 class BeamSearchScorer(HybridBlock):
     r"""Score function used in beam search.
@@ -141,7 +148,7 @@ def _reconstruct_flattened_structure(structure, flattened):
     raise NotImplementedError
 
 
-def _expand_to_beam_size(data, beam_size, batch_size, state_info=None):
+def _expand_to_beam_size(data: __T, beam_size, batch_size, state_info=None) -> __T:
     """Tile all the states to have batch_size * beam_size on the batch axis.
 
     Parameters
diff --git a/src/gluonnlp/optimizer/__init__.py b/src/gluonnlp/optimizer/__init__.py
index 7e052e4391..591fb16568 100644
--- a/src/gluonnlp/optimizer/__init__.py
+++ b/src/gluonnlp/optimizer/__init__.py
@@ -18,6 +18,8 @@
 # pylint: disable=wildcard-import
 """NLP optimizer."""
 
+from . import bert_adam, lamb
+
 from .bert_adam import *
 from .lamb import *
 
diff --git a/src/gluonnlp/utils/__init__.py b/src/gluonnlp/utils/__init__.py
index d36bfd7aac..df40b44bbc 100644
--- a/src/gluonnlp/utils/__init__.py
+++ b/src/gluonnlp/utils/__init__.py
@@ -18,11 +18,10 @@
 # pylint: disable=wildcard-import, arguments-differ
 """Module for utility functions."""
 
-from . import (parallel, parameter, files)
-
+from . import files, parallel, parameter, version
+from .files import *
 from .parallel import *
 from .parameter import *
-from .files import *
 from .version import *
 
 __all__ = parallel.__all__ + parameter.__all__ + files.__all__ + version.__all__
diff --git a/src/gluonnlp/utils/parallel.py b/src/gluonnlp/utils/parallel.py
index 76b52ac0e5..42dc5e60f7 100644
--- a/src/gluonnlp/utils/parallel.py
+++ b/src/gluonnlp/utils/parallel.py
@@ -15,11 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 """Utility functions for parallel processing."""
+import queue
 import threading
-try:
-    import Queue as queue
-except ImportError:
-    import queue
 
 __all__ = ['Parallelizable', 'Parallel']
diff --git a/src/gluonnlp/vocab/vocab.py b/src/gluonnlp/vocab/vocab.py
index 46c35c4be6..bbc94dd343 100644
--- a/src/gluonnlp/vocab/vocab.py
+++ b/src/gluonnlp/vocab/vocab.py
@@ -261,7 +261,8 @@ def __init__(self, counter: Optional[Counter] = None, max_size: Optional[int] =
         if token_to_idx:
             self._sort_index_according_to_user_specification(token_to_idx)
         if unknown_token:
-            self._token_to_idx._default = self._token_to_idx[unknown_token]
+            self._token_to_idx._default = \
+                self._token_to_idx[unknown_token]  # pytype: disable=not-writable
 
     def _index_counter_keys(self, counter, unknown_token, special_tokens, max_size,
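Note: `_token_to_idx._default` is what makes out-of-vocabulary lookups fall back to the unknown token's index; the change above only silences pytype's `not-writable` warning on that private attribute. A small usage sketch:

```python
from collections import Counter

import gluonnlp as nlp

vocab = nlp.Vocab(Counter(['hello', 'world', 'world']))
print(vocab['world'])       # regular index of a known token
print(vocab['never-seen'])  # same as vocab[vocab.unknown_token], via _token_to_idx._default
```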