Commit 297e147

[CI] Enable type checks and inference with pytype (dmlc#1018)
* Fix __init__.py files containing undefined names

* Fix use of typing.NamedTuple in batchify.py

typing.NamedTuple is not a generic type that can be used as a type annotation.
Rather, its intended usage is for users to instantiate a specific NamedTuple
class, which can then be used for type checking.
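
A minimal sketch of the intended pattern (the `SampleRecord` type below is hypothetical and used only for illustration): declare a concrete NamedTuple class once and annotate with that class, not with `typing.NamedTuple` itself.

```python
from typing import List, NamedTuple

# Hypothetical record type, declared via the functional form that also works on Python 3.5.
SampleRecord = NamedTuple('SampleRecord', [('token_ids', List[int]), ('length', int)])

def flatten(samples: List[SampleRecord]) -> List[int]:
    # Annotate with the concrete SampleRecord class rather than typing.NamedTuple.
    return [i for s in samples for i in s.token_ids]

print(flatten([SampleRecord(token_ids=[1, 2], length=2)]))
```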

* Remove Python 2 Queue imports

* Disable pytype at a few lines of code
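
The diffs below do this with pytype's inline directive: a trailing comment such as `# pytype: disable=attribute-error` suppresses one named check on that line only. A hedged, minimal illustration (not taken from the repository):

```python
from typing import Optional

def first_line(text: Optional[str]) -> str:
    # pytype infers that `text` may be None here, which it would normally report
    # as attribute-error; the trailing directive silences the check for this
    # single line while leaving the rest of the file checked.
    return text.splitlines()[0]  # pytype: disable=attribute-error

print(first_line('a\nb'))
```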

* Simplify GBWStream class hierarchy

Fixes use of uninitialized attributes in former base class
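
A hedged sketch of the issue (class and attribute names simplified): the former base class read attributes that only the subclass assigned in its `__init__`, so pytype saw uses of attributes never initialized on the base. Merging the classes, as in the google_billion_word.py diff below, turns the lookup tables into class attributes that are visibly initialized.

```python
# Before (sketch): the base class dereferences an attribute it never defines.
class _Base:
    def __init__(self, segment):
        subdir, pattern = self._data_file[segment]  # flagged: _data_file not initialized here

class _Sub(_Base):
    def __init__(self, segment):
        self._data_file = {'train': ('training-monolingual.tokenized.shuffled',
                                     'news.en-00*-of-00100')}
        super().__init__(segment)

# After (sketch): a single class, with the table as a class attribute.
class Merged:
    _data_file = {'train': ('training-monolingual.tokenized.shuffled',
                            'news.en-00*-of-00100')}

    def __init__(self, segment='train'):
        subdir, pattern = self._data_file[segment]

Merged()
```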

* Fix edge-case in TokenEmbedding that could cause calling NoneType

In that case a `'NoneType' object is not callable` error would be raised. Instead,
print a proper error message.
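
The token_embedding.py change itself is not among the diffs rendered below, so the following is only a hedged sketch of the general pattern (names hypothetical): guard the possibly-unset callable and raise a descriptive error rather than letting Python emit a bare TypeError.

```python
from typing import Callable, Optional

def lookup(token: str, unknown_lookup: Optional[Callable[[str], str]] = None) -> str:
    # Hypothetical illustration: fail with an actionable message when no
    # fallback lookup was configured, instead of calling None.
    if unknown_lookup is None:
        raise KeyError('Token "{}" is unknown and no unknown_lookup was provided.'.format(token))
    return unknown_lookup(token)
```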

* Enable type checks and inference with pytype

* Fix lint

* Disable pytype import-error for optional dependencies
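
As the src/gluonnlp/base.py diff below shows, the directive is attached to the import inside the try/except shim for the optional dependency. A condensed sketch of that pattern (mirroring the shims this commit centralizes, not a verbatim copy):

```python
try:
    from numba import njit, prange  # pytype: disable=import-error
    numba_njit = njit(nogil=True)
    numba_prange = prange
except ImportError:
    # Fallback shims so callers keep working without numba installed.
    numba_prange = range

    def numba_njit(func):
        return func
```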
leezu authored Nov 28, 2019
1 parent b2f4e7b commit 297e147
Showing 23 changed files with 125 additions and 115 deletions.
8 changes: 8 additions & 0 deletions .pytype.cfg
@@ -0,0 +1,8 @@
# NOTE: All relative paths are relative to the location of this file.
[pytype]
# Space-separated list of files or directories to process.
inputs =
    src/gluonnlp

# Python version (major.minor) of the target code.
python_version = 3.5
4 changes: 4 additions & 0 deletions Makefile
@@ -24,12 +24,16 @@ flake8:
pylint:
	pylint --rcfile=$(ROOTDIR)/.pylintrc $(lintdir)

pytype:
	pytype --config=$(ROOTDIR)/.pytype.cfg

restruc:
	python setup.py check --restructuredtext --strict

lint:
	make lintdir=$(lintdir) flake8
	make lintdir=$(lintdir) pylint
	make pytype
	make lintdir=$(lintdir) ratcheck
	make restruc

2 changes: 2 additions & 0 deletions env/cpu/py3-master.yml
@@ -6,6 +6,8 @@ dependencies:
- perl
- pip:
- cython
- boto3
- pytype==2019.10.17
- pytest==5.2.3
- pytest-env==0.6.2
- pytest-cov==2.8.1
1 change: 1 addition & 0 deletions env/docker/py3.yml
@@ -21,6 +21,7 @@ dependencies:
- jieba
- scikit-learn==0.21.3
- cython
- pytype==2019.10.17
- pytest==5.2.3
- pytest-env==0.6.2
- pytest-cov==2.8.1
2 changes: 2 additions & 0 deletions env/gpu/py3-master.yml
@@ -20,6 +20,8 @@ dependencies:
- seaborn
- jieba
- cython
- boto3
- pytype==2019.10.17
- pytest==5.2.3
- pytest-env==0.6.2
- pytest-cov==2.8.1
2 changes: 1 addition & 1 deletion src/gluonnlp/base.py
@@ -23,7 +23,7 @@
__all__ = ['numba_njit', 'numba_prange', 'numba_jitclass', 'numba_types', 'get_home_dir']

try:
from numba import njit, prange, jitclass, types
from numba import njit, prange, jitclass, types # pytype: disable=import-error
numba_njit = njit(nogil=True)
numba_prange = prange
numba_jitclass = jitclass
3 changes: 2 additions & 1 deletion src/gluonnlp/data/__init__.py
@@ -39,9 +39,10 @@
from .word_embedding_evaluation import *
from .intent_slot import *


__all__ = (['batchify'] + utils.__all__ + transforms.__all__ + sampler.__all__
+ dataset.__all__ + corpora.__all__ + sentiment.__all__
+ word_embedding_evaluation.__all__ + stream.__all__ + conll.__all__
+ translation.__all__ + registry.__all__ + question_answering.__all__
+ dataloader.__all__ + candidate_sampler.__all__ + intent_slot.__all__
+ glue.__all__)
+ glue.__all__) # pytype: disable=attribute-error
2 changes: 1 addition & 1 deletion src/gluonnlp/data/batchify/__init__.py
@@ -18,7 +18,7 @@
# pylint: disable=wildcard-import
"""Batchify helpers."""

from . import batchify, language_model
from . import batchify, embedding, language_model
from .batchify import *
from .embedding import *
from .language_model import *
24 changes: 10 additions & 14 deletions src/gluonnlp/data/batchify/batchify.py
@@ -20,9 +20,8 @@

import warnings
import math
from typing import Dict as t_Dict, Callable as t_Callable,\
NamedTuple as t_NamedTuple, List as t_List, Tuple as t_Tuple, AnyStr,\
Union as t_Union
from typing import (Dict as t_Dict, Callable as t_Callable, List as t_List, Tuple as t_Tuple,
AnyStr, Union as t_Union)

import numpy as np
import mxnet as mx
@@ -461,8 +460,8 @@ class NamedTuple:
Parameters
----------
container
The object that constructs the namedtuple.
container : NamedTuple class
The object that constructs the NamedTuple.
fn_info
The information of the inner batchify functions.
@@ -501,11 +500,8 @@ class NamedTuple:
[0 1 0]
<NDArray 3 @cpu_shared(0)>)
"""
def __init__(self,
container: t_NamedTuple,
fn_info: t_Union[t_List[t_Callable],
t_Tuple[t_Callable],
t_Dict[AnyStr, t_Callable]]):
def __init__(self, container, fn_info: t_Union[t_List[t_Callable], t_Tuple[t_Callable],
t_Dict[AnyStr, t_Callable]]):
self._container = container
if isinstance(fn_info, (list, tuple)):
if len(container._fields) != len(fn_info):
@@ -526,17 +522,17 @@ def __init__(self,
raise ValueError('All batchify functions must be callable.')
self._fn_l = fn_info

def __call__(self, data: t_List[t_NamedTuple]) -> t_NamedTuple:
def __call__(self, data):
"""Batchify the input data.
Parameters
----------
data
The samples to batchfy. Each sample should be a namedtuple.
data : List of NamedTuple
The samples to batchify. Each sample should be a NamedTuple.
Returns
-------
ret
ret : List of NamedTuple
A namedtuple of length N. Contains the batchified result of each attribute in the input.
"""
if not isinstance(data[0], self._container):
13 changes: 2 additions & 11 deletions src/gluonnlp/data/batchify/embedding.py
@@ -24,18 +24,9 @@

import numpy as np

from ...base import numba_njit, numba_prange
from ..stream import DataStream

try:
from numba import njit, prange
numba_njit = njit(nogil=True)
except ImportError:
# Define numba shims
prange = range

def numba_njit(func):
return func


class EmbeddingCenterContextBatchify:
"""Helper to create batches of center and contexts words.
@@ -127,7 +118,7 @@ def __init__(self, sentences, batch_size, window_size,
self._index_dtype = index_dtype

def __iter__(self):
if prange is range:
if numba_prange is range:
logging.warning(
'EmbeddingCenterContextBatchify supports just in time compilation '
'with numba, but numba is not installed. '
3 changes: 2 additions & 1 deletion src/gluonnlp/data/conll.py
@@ -79,7 +79,8 @@ def _read_data(self):
results = []
for path in paths:
with gzip.open(path, 'r') if path.endswith('gz') else io.open(path, 'rb') as f:
line_iter = codecs.getreader(self.codec)(io.BufferedReader(f))
line_iter = codecs.getreader(self.codec)\
(io.BufferedReader(f)) # pytype: disable=wrong-arg-types
results.append(self._process_iter(line_iter))
return list([x for field in item for x in field] for item in zip(*results))

113 changes: 53 additions & 60 deletions src/gluonnlp/data/corpora/google_billion_word.py
@@ -29,41 +29,74 @@

from mxnet.gluon.utils import _get_repo_file_url, check_sha1, download

from ... import _constants as C
from ..._constants import EOS_TOKEN
from ...base import get_home_dir
from ...vocab import Vocab
from ..stream import SimpleDatasetStream
from ..dataset import CorpusDataset
from ...base import get_home_dir
from ..stream import SimpleDatasetStream


class GBWStream(SimpleDatasetStream):
"""1-Billion-Word word-level dataset for language modeling, from Google.
The GBWStream iterates over CorpusDatasets(flatten=False).
Source http://www.statmt.org/lm-benchmark
License: Apache
Parameters
----------
segment : {'train', 'test'}, default 'train'
Dataset segment.
skip_empty : bool, default True
Whether to skip the empty samples produced from sample_splitters. If False, `bos` and `eos`
will be added in empty samples.
bos : str or None, default None
The token to add at the beginning of each sentence. If None, nothing is added.
eos : str or None, default '<eos>'
The token to add at the end of each sentence. If None, nothing is added.
root : str, default '$MXNET_HOME/datasets/gbw'
Path to temp folder for storing data.
MXNET_HOME defaults to '~/.mxnet'.
"""

class _GBWStream(SimpleDatasetStream):
def __init__(self, namespace, segment, bos, eos, skip_empty, root):
"""Directory layout:
- root ($MXNET_HOME/datasets/gbw)
- archive_file (1-billion-word-language-modeling-benchmark-r13output.tar.gz)
- dir (1-billion-word-language-modeling-benchmark-r13output)
- subdir (training-monolingual.tokenized.shuffled)
- subdir (heldout-monolingual.tokenized.shuffled)
"""
_archive_data = ('1-billion-word-language-modeling-benchmark-r13output.tar.gz',
'4df859766482e12264a5a9d9fb7f0e276020447d')
_archive_vocab = ('gbw-ebb1a287.zip',
'63b335dcc27b6804d0a14acb88332d2602fe0f59')
_data_file = {'train': ('training-monolingual.tokenized.shuffled',
'news.en-00*-of-00100',
'5e0d7050b37a99fd50ce7e07dc52468b2a9cd9e8'),
'test': ('heldout-monolingual.tokenized.shuffled',
'news.en.heldout-00000-of-00050',
'0a8e2b7496ba0b5c05158f282b9b351356875445')}
_vocab_file = ('gbw-ebb1a287.vocab',
'ebb1a287ca14d8fa6f167c3a779e5e7ed63ac69f')

# Directory layout:
# - root ($MXNET_HOME/datasets/gbw)
# - archive_file (1-billion-word-language-modeling-benchmark-r13output.tar.gz)
# - dir (1-billion-word-language-modeling-benchmark-r13output)
# - subdir (training-monolingual.tokenized.shuffled)
# - subdir (heldout-monolingual.tokenized.shuffled)

def __init__(self, segment='train', skip_empty=True, bos=None, eos=EOS_TOKEN,
root=os.path.join(get_home_dir(), 'datasets', 'gbw')):
root = os.path.expanduser(root)
if not os.path.isdir(root):
os.makedirs(root)
self._root = root
self._dir = os.path.join(root, '1-billion-word-language-modeling-benchmark-r13output')
self._namespace = 'gluon/dataset/{}'.format(namespace)
self._namespace = 'gluon/dataset/gbw'
subdir_name, pattern, data_hash = self._data_file[segment]
self._subdir = os.path.join(self._dir, subdir_name)
self._file_pattern = os.path.join(self._subdir, pattern)
self._data_hash = data_hash
self._get_data()
sampler = 'sequential' if segment != 'train' else 'random'
super(_GBWStream, self).__init__(
dataset=CorpusDataset,
file_pattern=self._file_pattern,
skip_empty=skip_empty,
bos=bos,
eos=eos,
file_sampler=sampler)
super().__init__(dataset=CorpusDataset, file_pattern=self._file_pattern,
skip_empty=skip_empty, bos=bos, eos=eos, file_sampler=sampler)

def _get_data(self):
archive_file_name, archive_hash = self._archive_data
@@ -106,46 +139,6 @@ def _get_vocab(self):
zf.extractall(path=root)
return path

class GBWStream(_GBWStream):
"""1-Billion-Word word-level dataset for language modeling, from Google.
The GBWStream iterates over CorpusDatasets(flatten=False).
Source http://www.statmt.org/lm-benchmark
License: Apache
Parameters
----------
segment : {'train', 'test'}, default 'train'
Dataset segment.
skip_empty : bool, default True
Whether to skip the empty samples produced from sample_splitters. If False, `bos` and `eos`
will be added in empty samples.
bos : str or None, default None
The token to add at the beginning of each sentence. If None, nothing is added.
eos : str or None, default '<eos>'
The token to add at the end of each sentence. If None, nothing is added.
root : str, default '$MXNET_HOME/datasets/gbw'
Path to temp folder for storing data.
MXNET_HOME defaults to '~/.mxnet'.
"""
def __init__(self, segment='train', skip_empty=True, bos=None, eos=C.EOS_TOKEN,
root=os.path.join(get_home_dir(), 'datasets', 'gbw')):
self._archive_data = ('1-billion-word-language-modeling-benchmark-r13output.tar.gz',
'4df859766482e12264a5a9d9fb7f0e276020447d')
self._archive_vocab = ('gbw-ebb1a287.zip',
'63b335dcc27b6804d0a14acb88332d2602fe0f59')
self._data_file = {'train': ('training-monolingual.tokenized.shuffled',
'news.en-00*-of-00100',
'5e0d7050b37a99fd50ce7e07dc52468b2a9cd9e8'),
'test': ('heldout-monolingual.tokenized.shuffled',
'news.en.heldout-00000-of-00050',
'0a8e2b7496ba0b5c05158f282b9b351356875445')}
self._vocab_file = ('gbw-ebb1a287.vocab',
'ebb1a287ca14d8fa6f167c3a779e5e7ed63ac69f')
super(GBWStream, self).__init__('gbw', segment, bos, eos, skip_empty, root)

@property
def vocab(self):
path = self._get_vocab()
8 changes: 2 additions & 6 deletions src/gluonnlp/data/stream.py
@@ -25,6 +25,7 @@
import multiprocessing
import multiprocessing.pool
import os
import queue
import random
import sys
import threading
@@ -33,12 +34,7 @@
import numpy as np

import mxnet as mx
from mxnet.gluon.data import RandomSampler, SequentialSampler, Sampler

try:
import Queue as queue
except ImportError:
import queue
from mxnet.gluon.data import RandomSampler, Sampler, SequentialSampler

__all__ = [
'DataStream', 'SimpleDataStream', 'DatasetStream', 'SimpleDatasetStream',
3 changes: 1 addition & 2 deletions src/gluonnlp/embedding/__init__.py
@@ -18,8 +18,7 @@
# pylint: disable=wildcard-import
"""Word embeddings."""

from . import evaluation, token_embedding
from .token_embedding import *

from . import evaluation

__all__ = (token_embedding.__all__ + ['evaluation'])