Commit 297e147

[CI] Enable type checks and inference with pytype (dmlc#1018)
* Fix __init__.py files containing undefined names

* Fix use of typing.NamedTuple in batchify.py

typing.NamedTuple is not a generic type that can be used as a type annotation.
Rather, its intended usage is for users to instantiate a specific NamedTuple
class, which can then be used for type checking.
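
A minimal sketch of the intended pattern (the `SampleRecord` type below is hypothetical and used only for illustration): declare a concrete NamedTuple class once and annotate with that class, not with `typing.NamedTuple` itself.

```python
from typing import List, NamedTuple

# Hypothetical record type, declared via the functional form that also works on Python 3.5.
SampleRecord = NamedTuple('SampleRecord', [('token_ids', List[int]), ('length', int)])

def flatten(samples: List[SampleRecord]) -> List[int]:
    # Annotate with the concrete SampleRecord class rather than typing.NamedTuple.
    return [i for s in samples for i in s.token_ids]

print(flatten([SampleRecord(token_ids=[1, 2], length=2)]))
```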

* Remove Python 2 Queue imports

* Disable pytype at a few lines of code
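
The diffs below do this with pytype's inline directive: a trailing comment such as `# pytype: disable=attribute-error` suppresses one named check on that line only. A hedged, minimal illustration (not taken from the repository):

```python
from typing import Optional

def first_line(text: Optional[str]) -> str:
    # pytype infers that `text` may be None here, which it would normally report
    # as attribute-error; the trailing directive silences the check for this
    # single line while leaving the rest of the file checked.
    return text.splitlines()[0]  # pytype: disable=attribute-error

print(first_line('a\nb'))
```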

* Simplify GBWStream class hierarchy

Fixes use of uninitialized attributes in former base class
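
A hedged sketch of the issue (class and attribute names simplified): the former base class read attributes that only the subclass assigned in its `__init__`, so pytype saw uses of attributes never initialized on the base. Merging the classes, as in the google_billion_word.py diff below, turns the lookup tables into class attributes that are visibly initialized.

```python
# Before (sketch): the base class dereferences an attribute it never defines.
class _Base:
    def __init__(self, segment):
        subdir, pattern = self._data_file[segment]  # flagged: _data_file not initialized here

class _Sub(_Base):
    def __init__(self, segment):
        self._data_file = {'train': ('training-monolingual.tokenized.shuffled',
                                     'news.en-00*-of-00100')}
        super().__init__(segment)

# After (sketch): a single class, with the table as a class attribute.
class Merged:
    _data_file = {'train': ('training-monolingual.tokenized.shuffled',
                            'news.en-00*-of-00100')}

    def __init__(self, segment='train'):
        subdir, pattern = self._data_file[segment]

Merged()
```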

* Fix edge-case in TokenEmbedding that could cause calling NoneType

In that case a `'NoneType' object is not callable` error would be raised. Instead,
print a proper error message.
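
The token_embedding.py change itself is not among the diffs rendered below, so the following is only a hedged sketch of the general pattern (names hypothetical): guard the possibly-unset callable and raise a descriptive error rather than letting Python emit a bare TypeError.

```python
from typing import Callable, Optional

def lookup(token: str, unknown_lookup: Optional[Callable[[str], str]] = None) -> str:
    # Hypothetical illustration: fail with an actionable message when no
    # fallback lookup was configured, instead of calling None.
    if unknown_lookup is None:
        raise KeyError('Token "{}" is unknown and no unknown_lookup was provided.'.format(token))
    return unknown_lookup(token)
```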

* Enable type checks and inference with pytype

* Fix lint

* Disable pytype import-error for optional dependencies
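
As the src/gluonnlp/base.py diff below shows, the directive is attached to the import inside the try/except shim for the optional dependency. A condensed sketch of that pattern (mirroring the shims this commit centralizes, not a verbatim copy):

```python
try:
    from numba import njit, prange  # pytype: disable=import-error
    numba_njit = njit(nogil=True)
    numba_prange = prange
except ImportError:
    # Fallback shims so callers keep working without numba installed.
    numba_prange = range

    def numba_njit(func):
        return func
```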
leezu authored Nov 28, 2019
1 parent b2f4e7b commit 297e147
Showing 23 changed files with 125 additions and 115 deletions.
8 changes: 8 additions & 0 deletions .pytype.cfg
@@ -0,0 +1,8 @@
# NOTE: All relative paths are relative to the location of this file.
[pytype]
# Space-separated list of files or directories to process.
inputs =
    src/gluonnlp

# Python version (major.minor) of the target code.
python_version = 3.5
4 changes: 4 additions & 0 deletions Makefile
@@ -24,12 +24,16 @@ flake8:
pylint:
	pylint --rcfile=$(ROOTDIR)/.pylintrc $(lintdir)

pytype:
	pytype --config=$(ROOTDIR)/.pytype.cfg

restruc:
	python setup.py check --restructuredtext --strict

lint:
	make lintdir=$(lintdir) flake8
	make lintdir=$(lintdir) pylint
	make pytype
	make lintdir=$(lintdir) ratcheck
	make restruc

2 changes: 2 additions & 0 deletions env/cpu/py3-master.yml
@@ -6,6 +6,8 @@ dependencies:
- perl
- pip:
- cython
- boto3
- pytype==2019.10.17
- pytest==5.2.3
- pytest-env==0.6.2
- pytest-cov==2.8.1
1 change: 1 addition & 0 deletions env/docker/py3.yml
@@ -21,6 +21,7 @@ dependencies:
- jieba
- scikit-learn==0.21.3
- cython
- pytype==2019.10.17
- pytest==5.2.3
- pytest-env==0.6.2
- pytest-cov==2.8.1
2 changes: 2 additions & 0 deletions env/gpu/py3-master.yml
@@ -20,6 +20,8 @@ dependencies:
- seaborn
- jieba
- cython
- boto3
- pytype==2019.10.17
- pytest==5.2.3
- pytest-env==0.6.2
- pytest-cov==2.8.1
2 changes: 1 addition & 1 deletion src/gluonnlp/base.py
@@ -23,7 +23,7 @@
__all__ = ['numba_njit', 'numba_prange', 'numba_jitclass', 'numba_types', 'get_home_dir']

try:
from numba import njit, prange, jitclass, types
from numba import njit, prange, jitclass, types # pytype: disable=import-error
numba_njit = njit(nogil=True)
numba_prange = prange
numba_jitclass = jitclass
3 changes: 2 additions & 1 deletion src/gluonnlp/data/__init__.py
@@ -39,9 +39,10 @@
from .word_embedding_evaluation import *
from .intent_slot import *


__all__ = (['batchify'] + utils.__all__ + transforms.__all__ + sampler.__all__
+ dataset.__all__ + corpora.__all__ + sentiment.__all__
+ word_embedding_evaluation.__all__ + stream.__all__ + conll.__all__
+ translation.__all__ + registry.__all__ + question_answering.__all__
+ dataloader.__all__ + candidate_sampler.__all__ + intent_slot.__all__
+ glue.__all__)
+ glue.__all__) # pytype: disable=attribute-error
2 changes: 1 addition & 1 deletion src/gluonnlp/data/batchify/__init__.py
@@ -18,7 +18,7 @@
# pylint: disable=wildcard-import
"""Batchify helpers."""

from . import batchify, language_model
from . import batchify, embedding, language_model
from .batchify import *
from .embedding import *
from .language_model import *
24 changes: 10 additions & 14 deletions src/gluonnlp/data/batchify/batchify.py
@@ -20,9 +20,8 @@

import warnings
import math
from typing import Dict as t_Dict, Callable as t_Callable,\
NamedTuple as t_NamedTuple, List as t_List, Tuple as t_Tuple, AnyStr,\
Union as t_Union
from typing import (Dict as t_Dict, Callable as t_Callable, List as t_List, Tuple as t_Tuple,
AnyStr, Union as t_Union)

import numpy as np
import mxnet as mx
@@ -461,8 +460,8 @@ class NamedTuple:
Parameters
----------
container
The object that constructs the namedtuple.
container : NamedTuple class
The object that constructs the NamedTuple.
fn_info
The information of the inner batchify functions.
@@ -501,11 +500,8 @@ class NamedTuple:
[0 1 0]
<NDArray 3 @cpu_shared(0)>)
"""
def __init__(self,
container: t_NamedTuple,
fn_info: t_Union[t_List[t_Callable],
t_Tuple[t_Callable],
t_Dict[AnyStr, t_Callable]]):
def __init__(self, container, fn_info: t_Union[t_List[t_Callable], t_Tuple[t_Callable],
t_Dict[AnyStr, t_Callable]]):
self._container = container
if isinstance(fn_info, (list, tuple)):
if len(container._fields) != len(fn_info):
@@ -526,17 +522,17 @@ def __init__(self,
raise ValueError('All batchify functions must be callable.')
self._fn_l = fn_info

def __call__(self, data: t_List[t_NamedTuple]) -> t_NamedTuple:
def __call__(self, data):
"""Batchify the input data.
Parameters
----------
data
The samples to batchfy. Each sample should be a namedtuple.
data : List of NamedTuple
The samples to batchify. Each sample should be a NamedTuple.
Returns
-------
ret
ret : List of NamedTuple
A namedtuple of length N. Contains the batchified result of each attribute in the input.
"""
if not isinstance(data[0], self._container):
13 changes: 2 additions & 11 deletions src/gluonnlp/data/batchify/embedding.py
@@ -24,18 +24,9 @@

import numpy as np

from ...base import numba_njit, numba_prange
from ..stream import DataStream

try:
from numba import njit, prange
numba_njit = njit(nogil=True)
except ImportError:
# Define numba shims
prange = range

def numba_njit(func):
return func


class EmbeddingCenterContextBatchify:
"""Helper to create batches of center and contexts words.
@@ -127,7 +118,7 @@ def __init__(self, sentences, batch_size, window_size,
self._index_dtype = index_dtype

def __iter__(self):
if prange is range:
if numba_prange is range:
logging.warning(
'EmbeddingCenterContextBatchify supports just in time compilation '
'with numba, but numba is not installed. '
3 changes: 2 additions & 1 deletion src/gluonnlp/data/conll.py
@@ -79,7 +79,8 @@ def _read_data(self):
results = []
for path in paths:
with gzip.open(path, 'r') if path.endswith('gz') else io.open(path, 'rb') as f:
line_iter = codecs.getreader(self.codec)(io.BufferedReader(f))
line_iter = codecs.getreader(self.codec)\
(io.BufferedReader(f)) # pytype: disable=wrong-arg-types
results.append(self._process_iter(line_iter))
return list([x for field in item for x in field] for item in zip(*results))

113 changes: 53 additions & 60 deletions src/gluonnlp/data/corpora/google_billion_word.py
@@ -29,41 +29,74 @@

from mxnet.gluon.utils import _get_repo_file_url, check_sha1, download

from ... import _constants as C
from ..._constants import EOS_TOKEN
from ...base import get_home_dir
from ...vocab import Vocab
from ..stream import SimpleDatasetStream
from ..dataset import CorpusDataset
from ...base import get_home_dir
from ..stream import SimpleDatasetStream


class GBWStream(SimpleDatasetStream):
"""1-Billion-Word word-level dataset for language modeling, from Google.
The GBWStream iterates over CorpusDatasets(flatten=False).
Source http://www.statmt.org/lm-benchmark
License: Apache
Parameters
----------
segment : {'train', 'test'}, default 'train'
Dataset segment.
skip_empty : bool, default True
Whether to skip the empty samples produced from sample_splitters. If False, `bos` and `eos`
will be added in empty samples.
bos : str or None, default None
The token to add at the beginning of each sentence. If None, nothing is added.
eos : str or None, default '<eos>'
The token to add at the end of each sentence. If None, nothing is added.
root : str, default '$MXNET_HOME/datasets/gbw'
Path to temp folder for storing data.
MXNET_HOME defaults to '~/.mxnet'.
"""

class _GBWStream(SimpleDatasetStream):
def __init__(self, namespace, segment, bos, eos, skip_empty, root):
"""Directory layout:
- root ($MXNET_HOME/datasets/gbw)
- archive_file (1-billion-word-language-modeling-benchmark-r13output.tar.gz)
- dir (1-billion-word-language-modeling-benchmark-r13output)
- subdir (training-monolingual.tokenized.shuffled)
- subdir (heldout-monolingual.tokenized.shuffled)
"""
_archive_data = ('1-billion-word-language-modeling-benchmark-r13output.tar.gz',
'4df859766482e12264a5a9d9fb7f0e276020447d')
_archive_vocab = ('gbw-ebb1a287.zip',
'63b335dcc27b6804d0a14acb88332d2602fe0f59')
_data_file = {'train': ('training-monolingual.tokenized.shuffled',
'news.en-00*-of-00100',
'5e0d7050b37a99fd50ce7e07dc52468b2a9cd9e8'),
'test': ('heldout-monolingual.tokenized.shuffled',
'news.en.heldout-00000-of-00050',
'0a8e2b7496ba0b5c05158f282b9b351356875445')}
_vocab_file = ('gbw-ebb1a287.vocab',
'ebb1a287ca14d8fa6f167c3a779e5e7ed63ac69f')

# Directory layout:
# - root ($MXNET_HOME/datasets/gbw)
# - archive_file (1-billion-word-language-modeling-benchmark-r13output.tar.gz)
# - dir (1-billion-word-language-modeling-benchmark-r13output)
# - subdir (training-monolingual.tokenized.shuffled)
# - subdir (heldout-monolingual.tokenized.shuffled)

def __init__(self, segment='train', skip_empty=True, bos=None, eos=EOS_TOKEN,
root=os.path.join(get_home_dir(), 'datasets', 'gbw')):
root = os.path.expanduser(root)
if not os.path.isdir(root):
os.makedirs(root)
self._root = root
self._dir = os.path.join(root, '1-billion-word-language-modeling-benchmark-r13output')
self._namespace = 'gluon/dataset/{}'.format(namespace)
self._namespace = 'gluon/dataset/gbw'
subdir_name, pattern, data_hash = self._data_file[segment]
self._subdir = os.path.join(self._dir, subdir_name)
self._file_pattern = os.path.join(self._subdir, pattern)
self._data_hash = data_hash
self._get_data()
sampler = 'sequential' if segment != 'train' else 'random'
super(_GBWStream, self).__init__(
dataset=CorpusDataset,
file_pattern=self._file_pattern,
skip_empty=skip_empty,
bos=bos,
eos=eos,
file_sampler=sampler)
super().__init__(dataset=CorpusDataset, file_pattern=self._file_pattern,
skip_empty=skip_empty, bos=bos, eos=eos, file_sampler=sampler)

def _get_data(self):
archive_file_name, archive_hash = self._archive_data
@@ -106,46 +139,6 @@ def _get_vocab(self):
zf.extractall(path=root)
return path

class GBWStream(_GBWStream):
"""1-Billion-Word word-level dataset for language modeling, from Google.
The GBWStream iterates over CorpusDatasets(flatten=False).
Source http://www.statmt.org/lm-benchmark
License: Apache
Parameters
----------
segment : {'train', 'test'}, default 'train'
Dataset segment.
skip_empty : bool, default True
Whether to skip the empty samples produced from sample_splitters. If False, `bos` and `eos`
will be added in empty samples.
bos : str or None, default None
The token to add at the beginning of each sentence. If None, nothing is added.
eos : str or None, default '<eos>'
The token to add at the end of each sentence. If None, nothing is added.
root : str, default '$MXNET_HOME/datasets/gbw'
Path to temp folder for storing data.
MXNET_HOME defaults to '~/.mxnet'.
"""
def __init__(self, segment='train', skip_empty=True, bos=None, eos=C.EOS_TOKEN,
root=os.path.join(get_home_dir(), 'datasets', 'gbw')):
self._archive_data = ('1-billion-word-language-modeling-benchmark-r13output.tar.gz',
'4df859766482e12264a5a9d9fb7f0e276020447d')
self._archive_vocab = ('gbw-ebb1a287.zip',
'63b335dcc27b6804d0a14acb88332d2602fe0f59')
self._data_file = {'train': ('training-monolingual.tokenized.shuffled',
'news.en-00*-of-00100',
'5e0d7050b37a99fd50ce7e07dc52468b2a9cd9e8'),
'test': ('heldout-monolingual.tokenized.shuffled',
'news.en.heldout-00000-of-00050',
'0a8e2b7496ba0b5c05158f282b9b351356875445')}
self._vocab_file = ('gbw-ebb1a287.vocab',
'ebb1a287ca14d8fa6f167c3a779e5e7ed63ac69f')
super(GBWStream, self).__init__('gbw', segment, bos, eos, skip_empty, root)

@property
def vocab(self):
path = self._get_vocab()
8 changes: 2 additions & 6 deletions src/gluonnlp/data/stream.py
@@ -25,6 +25,7 @@
import multiprocessing
import multiprocessing.pool
import os
import queue
import random
import sys
import threading
@@ -33,12 +34,7 @@
import numpy as np

import mxnet as mx
from mxnet.gluon.data import RandomSampler, SequentialSampler, Sampler

try:
import Queue as queue
except ImportError:
import queue
from mxnet.gluon.data import RandomSampler, Sampler, SequentialSampler

__all__ = [
'DataStream', 'SimpleDataStream', 'DatasetStream', 'SimpleDatasetStream',
3 changes: 1 addition & 2 deletions src/gluonnlp/embedding/__init__.py
@@ -18,8 +18,7 @@
# pylint: disable=wildcard-import
"""Word embeddings."""

from . import evaluation, token_embedding
from .token_embedding import *

from . import evaluation

__all__ = (token_embedding.__all__ + ['evaluation'])