diff --git a/.pytype.cfg b/.pytype.cfg
new file mode 100644
index 0000000000..8220a41658
--- /dev/null
+++ b/.pytype.cfg
@@ -0,0 +1,8 @@
+# NOTE: All relative paths are relative to the location of this file.
+[pytype]
+# Space-separated list of files or directories to process.
+inputs =
+    src/gluonnlp
+
+# Python version (major.minor) of the target code.
+python_version = 3.5
diff --git a/Makefile b/Makefile
index c370c4ca7d..90b1b01e19 100644
--- a/Makefile
+++ b/Makefile
@@ -24,12 +24,16 @@ flake8:
 pylint:
 	pylint --rcfile=$(ROOTDIR)/.pylintrc $(lintdir)
 
+pytype:
+	pytype --config=$(ROOTDIR)/.pytype.cfg
+
 restruc:
 	python setup.py check --restructuredtext --strict
 
 lint:
 	make lintdir=$(lintdir) flake8
 	make lintdir=$(lintdir) pylint
+	make pytype
 	make lintdir=$(lintdir) ratcheck
 	make restruc
diff --git a/env/cpu/py3-master.yml b/env/cpu/py3-master.yml
index e51e77234d..5621e266bb 100644
--- a/env/cpu/py3-master.yml
+++ b/env/cpu/py3-master.yml
@@ -6,6 +6,8 @@ dependencies:
   - perl
   - pip:
     - cython
+    - boto3
+    - pytype==2019.10.17
     - pytest==5.2.3
     - pytest-env==0.6.2
     - pytest-cov==2.8.1
diff --git a/env/docker/py3.yml b/env/docker/py3.yml
index 11a638464c..7bea6f010f 100644
--- a/env/docker/py3.yml
+++ b/env/docker/py3.yml
@@ -21,6 +21,7 @@ dependencies:
   - jieba
   - scikit-learn==0.21.3
   - cython
+  - pytype==2019.10.17
   - pytest==5.2.3
   - pytest-env==0.6.2
   - pytest-cov==2.8.1
diff --git a/env/gpu/py3-master.yml b/env/gpu/py3-master.yml
index 665e1bb432..11c85a831c 100644
--- a/env/gpu/py3-master.yml
+++ b/env/gpu/py3-master.yml
@@ -20,6 +20,8 @@ dependencies:
   - seaborn
   - jieba
   - cython
+  - boto3
+  - pytype==2019.10.17
   - pytest==5.2.3
   - pytest-env==0.6.2
   - pytest-cov==2.8.1
diff --git a/src/gluonnlp/base.py b/src/gluonnlp/base.py
index 0054cf816e..ea2e9835e4 100644
--- a/src/gluonnlp/base.py
+++ b/src/gluonnlp/base.py
@@ -23,7 +23,7 @@
 __all__ = ['numba_njit', 'numba_prange', 'numba_jitclass', 'numba_types', 'get_home_dir']
 
 try:
-    from numba import njit, prange, jitclass, types
+    from numba import njit, prange, jitclass, types  # pytype: disable=import-error
     numba_njit = njit(nogil=True)
     numba_prange = prange
     numba_jitclass = jitclass
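Note: the `# pytype: disable=import-error` comment makes pytype skip the unresolved optional import instead of failing the whole check. Below is a minimal sketch of the optional-numba shim that `base.py` centralizes; the `try` branch is taken from the diff, while the fallback branch is an assumption modelled on the shim this PR removes from `batchify/embedding.py`.

```python
# Sketch of an optional-dependency shim, as centralized in gluonnlp/base.py.
# The try-branch mirrors the diff above; the except-branch is a hedged
# reconstruction based on the code removed from batchify/embedding.py.
try:
    from numba import njit, prange, jitclass, types  # pytype: disable=import-error
    numba_njit = njit(nogil=True)
    numba_prange = prange
    numba_jitclass = jitclass
    numba_types = types
except ImportError:
    # numba is optional: fall back to no-op shims so callers still run, just slower.
    numba_prange = range

    def numba_njit(func):
        return func
```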
diff --git a/src/gluonnlp/data/__init__.py b/src/gluonnlp/data/__init__.py
index 79ea84b9de..d96863105d 100644
--- a/src/gluonnlp/data/__init__.py
+++ b/src/gluonnlp/data/__init__.py
@@ -39,9 +39,10 @@
 from .word_embedding_evaluation import *
 from .intent_slot import *
 
+
 __all__ = (['batchify'] + utils.__all__ + transforms.__all__ + sampler.__all__
            + dataset.__all__ + corpora.__all__ + sentiment.__all__
            + word_embedding_evaluation.__all__ + stream.__all__ + conll.__all__
            + translation.__all__ + registry.__all__ + question_answering.__all__
            + dataloader.__all__ + candidate_sampler.__all__ + intent_slot.__all__
-           + glue.__all__)
+           + glue.__all__)  # pytype: disable=attribute-error
diff --git a/src/gluonnlp/data/batchify/__init__.py b/src/gluonnlp/data/batchify/__init__.py
index 633fbbb381..7c7e1b6f71 100644
--- a/src/gluonnlp/data/batchify/__init__.py
+++ b/src/gluonnlp/data/batchify/__init__.py
@@ -18,7 +18,7 @@
 # pylint: disable=wildcard-import
 """Batchify helpers."""
 
-from . import batchify, language_model
+from . import batchify, embedding, language_model
 from .batchify import *
 from .embedding import *
 from .language_model import *
diff --git a/src/gluonnlp/data/batchify/batchify.py b/src/gluonnlp/data/batchify/batchify.py
index 8801583489..2e800b5892 100644
--- a/src/gluonnlp/data/batchify/batchify.py
+++ b/src/gluonnlp/data/batchify/batchify.py
@@ -20,9 +20,8 @@
 import warnings
 import math
 
-from typing import Dict as t_Dict, Callable as t_Callable,\
-    NamedTuple as t_NamedTuple, List as t_List, Tuple as t_Tuple, AnyStr,\
-    Union as t_Union
+from typing import (Dict as t_Dict, Callable as t_Callable, List as t_List, Tuple as t_Tuple,
+                    AnyStr, Union as t_Union)
 
 import numpy as np
 import mxnet as mx
@@ -461,8 +460,8 @@ class NamedTuple:
 
     Parameters
     ----------
-    container
-        The object that constructs the namedtuple.
+    container : NamedTuple class
+        The object that constructs the NamedTuple.
     fn_info
         The information of the inner batchify functions.
 
@@ -501,11 +500,8 @@ class NamedTuple:
     [0 1 0]
     )
     """
-    def __init__(self,
-                 container: t_NamedTuple,
-                 fn_info: t_Union[t_List[t_Callable],
-                                  t_Tuple[t_Callable],
-                                  t_Dict[AnyStr, t_Callable]]):
+    def __init__(self, container, fn_info: t_Union[t_List[t_Callable], t_Tuple[t_Callable],
+                                                   t_Dict[AnyStr, t_Callable]]):
         self._container = container
         if isinstance(fn_info, (list, tuple)):
             if len(container._fields) != len(fn_info):
@@ -526,17 +522,17 @@ def __init__(self,
                     raise ValueError('All batchify functions must be callable.')
             self._fn_l = fn_info
 
-    def __call__(self, data: t_List[t_NamedTuple]) -> t_NamedTuple:
+    def __call__(self, data):
         """Batchify the input data.
 
         Parameters
         ----------
-        data
-            The samples to batchfy. Each sample should be a namedtuple.
+        data : List of NamedTuple
+            The samples to batchify. Each sample should be a NamedTuple.
 
         Returns
         -------
-        ret
+        ret : List of NamedTuple
             A namedtuple of length N. Contains the batchified result of each attribute in the input.
         """
         if not isinstance(data[0], self._container):
diff --git a/src/gluonnlp/data/batchify/embedding.py b/src/gluonnlp/data/batchify/embedding.py
index bce48271be..7f859b3116 100644
--- a/src/gluonnlp/data/batchify/embedding.py
+++ b/src/gluonnlp/data/batchify/embedding.py
@@ -24,18 +24,9 @@
 
 import numpy as np
 
+from ...base import numba_njit, numba_prange
 from ..stream import DataStream
 
-try:
-    from numba import njit, prange
-    numba_njit = njit(nogil=True)
-except ImportError:
-    # Define numba shims
-    prange = range
-
-    def numba_njit(func):
-        return func
-
 
 class EmbeddingCenterContextBatchify:
     """Helper to create batches of center and contexts words.
@@ -127,7 +118,7 @@ def __init__(self, sentences, batch_size, window_size,
         self._index_dtype = index_dtype
 
     def __iter__(self):
-        if prange is range:
+        if numba_prange is range:
             logging.warning(
                 'EmbeddingCenterContextBatchify supports just in time compilation '
                 'with numba, but numba is not installed. '
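Note: the `NamedTuple` batchify helper keeps working as before; only its annotations are relaxed so pytype can analyse them. A hedged usage sketch follows (the `TrainExample` container and its field names are made up for illustration):

```python
from collections import namedtuple
import gluonnlp.data.batchify as bf

# Hypothetical container; any namedtuple class works.
TrainExample = namedtuple('TrainExample', ['data', 'label'])

# One inner batchify function per field, given as a dict (a list/tuple also works).
batchify_fn = bf.NamedTuple(TrainExample, {'data': bf.Pad(), 'label': bf.Stack()})

samples = [TrainExample(data=[1, 2, 3, 4], label=0),
           TrainExample(data=[5, 7], label=1)]
batch = batchify_fn(samples)  # -> TrainExample(data=<padded NDArray>, label=<stacked NDArray>)
```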
diff --git a/src/gluonnlp/data/conll.py b/src/gluonnlp/data/conll.py
index 95a4aa53f6..b2602e944e 100644
--- a/src/gluonnlp/data/conll.py
+++ b/src/gluonnlp/data/conll.py
@@ -79,7 +79,8 @@ def _read_data(self):
         results = []
         for path in paths:
             with gzip.open(path, 'r') if path.endswith('gz') else io.open(path, 'rb') as f:
-                line_iter = codecs.getreader(self.codec)(io.BufferedReader(f))
+                line_iter = codecs.getreader(self.codec)\
+                    (io.BufferedReader(f))  # pytype: disable=wrong-arg-types
                 results.append(self._process_iter(line_iter))
         return list([x for field in item for x in field] for item in zip(*results))
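Note: splitting the `codecs.getreader(...)(...)` call only sidesteps a pytype `wrong-arg-types` complaint; the runtime behaviour is unchanged. A sketch of the same read pattern, with a hypothetical file name:

```python
import codecs
import gzip
import io

# Read a gzipped file line by line through an explicit codec, mirroring _read_data above.
with gzip.open('sample.conll.gz', 'rb') as f:
    line_iter = codecs.getreader('utf-8')(io.BufferedReader(f))
    for line in line_iter:
        fields = line.rstrip('\n').split('\t')
```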
diff --git a/src/gluonnlp/data/corpora/google_billion_word.py b/src/gluonnlp/data/corpora/google_billion_word.py
index 81d9cfe350..36128a4dbc 100644
--- a/src/gluonnlp/data/corpora/google_billion_word.py
+++ b/src/gluonnlp/data/corpora/google_billion_word.py
@@ -29,41 +29,74 @@
 
 from mxnet.gluon.utils import _get_repo_file_url, check_sha1, download
 
-from ... import _constants as C
+from ..._constants import EOS_TOKEN
+from ...base import get_home_dir
 from ...vocab import Vocab
-from ..stream import SimpleDatasetStream
 from ..dataset import CorpusDataset
-from ...base import get_home_dir
+from ..stream import SimpleDatasetStream
+
+
+class GBWStream(SimpleDatasetStream):
+    """1-Billion-Word word-level dataset for language modeling, from Google.
+
+    The GBWStream iterates over CorpusDatasets(flatten=False).
+
+    Source http://www.statmt.org/lm-benchmark
+
+    License: Apache
+
+    Parameters
+    ----------
+    segment : {'train', 'test'}, default 'train'
+        Dataset segment.
+    skip_empty : bool, default True
+        Whether to skip the empty samples produced from sample_splitters. If False, `bos` and `eos`
+        will be added in empty samples.
+    bos : str or None, default None
+        The token to add at the beginning of each sentence. If None, nothing is added.
+    eos : str or None, default '<eos>'
+        The token to add at the end of each sentence. If None, nothing is added.
+    root : str, default '$MXNET_HOME/datasets/gbw'
+        Path to temp folder for storing data.
+        MXNET_HOME defaults to '~/.mxnet'.
+    """
 
-class _GBWStream(SimpleDatasetStream):
-    def __init__(self, namespace, segment, bos, eos, skip_empty, root):
-        """Directory layout:
-        - root ($MXNET_HOME/datasets/gbw)
-          - archive_file (1-billion-word-language-modeling-benchmark-r13output.tar.gz)
-          - dir (1-billion-word-language-modeling-benchmark-r13output)
-            - subdir (training-monolingual.tokenized.shuffled)
-            - subdir (heldout-monolingual.tokenized.shuffled)
-        """
+    _archive_data = ('1-billion-word-language-modeling-benchmark-r13output.tar.gz',
+                     '4df859766482e12264a5a9d9fb7f0e276020447d')
+    _archive_vocab = ('gbw-ebb1a287.zip',
+                      '63b335dcc27b6804d0a14acb88332d2602fe0f59')
+    _data_file = {'train': ('training-monolingual.tokenized.shuffled',
+                            'news.en-00*-of-00100',
+                            '5e0d7050b37a99fd50ce7e07dc52468b2a9cd9e8'),
+                  'test': ('heldout-monolingual.tokenized.shuffled',
+                           'news.en.heldout-00000-of-00050',
+                           '0a8e2b7496ba0b5c05158f282b9b351356875445')}
+    _vocab_file = ('gbw-ebb1a287.vocab',
+                   'ebb1a287ca14d8fa6f167c3a779e5e7ed63ac69f')
+
+    # Directory layout:
+    # - root ($MXNET_HOME/datasets/gbw)
+    #   - archive_file (1-billion-word-language-modeling-benchmark-r13output.tar.gz)
+    #   - dir (1-billion-word-language-modeling-benchmark-r13output)
+    #     - subdir (training-monolingual.tokenized.shuffled)
+    #     - subdir (heldout-monolingual.tokenized.shuffled)
+
+    def __init__(self, segment='train', skip_empty=True, bos=None, eos=EOS_TOKEN,
+                 root=os.path.join(get_home_dir(), 'datasets', 'gbw')):
         root = os.path.expanduser(root)
         if not os.path.isdir(root):
             os.makedirs(root)
         self._root = root
         self._dir = os.path.join(root, '1-billion-word-language-modeling-benchmark-r13output')
-        self._namespace = 'gluon/dataset/{}'.format(namespace)
+        self._namespace = 'gluon/dataset/gbw'
         subdir_name, pattern, data_hash = self._data_file[segment]
         self._subdir = os.path.join(self._dir, subdir_name)
         self._file_pattern = os.path.join(self._subdir, pattern)
         self._data_hash = data_hash
         self._get_data()
         sampler = 'sequential' if segment != 'train' else 'random'
-        super(_GBWStream, self).__init__(
-            dataset=CorpusDataset,
-            file_pattern=self._file_pattern,
-            skip_empty=skip_empty,
-            bos=bos,
-            eos=eos,
-            file_sampler=sampler)
+        super().__init__(dataset=CorpusDataset, file_pattern=self._file_pattern,
+                         skip_empty=skip_empty, bos=bos, eos=eos, file_sampler=sampler)
 
     def _get_data(self):
         archive_file_name, archive_hash = self._archive_data
@@ -106,46 +139,6 @@ def _get_vocab(self):
                     zf.extractall(path=root)
         return path
 
-class GBWStream(_GBWStream):
-    """1-Billion-Word word-level dataset for language modeling, from Google.
-
-    The GBWSream iterates over CorpusDatasets(flatten=False).
-
-    Source http://www.statmt.org/lm-benchmark
-
-    License: Apache
-
-    Parameters
-    ----------
-    segment : {'train', 'test'}, default 'train'
-        Dataset segment.
-    skip_empty : bool, default True
-        Whether to skip the empty samples produced from sample_splitters. If False, `bos` and `eos`
-        will be added in empty samples.
-    bos : str or None, default None
-        The token to add at the begining of each sentence. If None, nothing is added.
-    eos : str or None, default '<eos>'
-        The token to add at the end of each sentence. If None, nothing is added.
-    root : str, default '$MXNET_HOME/datasets/gbw'
-        Path to temp folder for storing data.
-        MXNET_HOME defaults to '~/.mxnet'.
-    """
-    def __init__(self, segment='train', skip_empty=True, bos=None, eos=C.EOS_TOKEN,
-                 root=os.path.join(get_home_dir(), 'datasets', 'gbw')):
-        self._archive_data = ('1-billion-word-language-modeling-benchmark-r13output.tar.gz',
-                              '4df859766482e12264a5a9d9fb7f0e276020447d')
-        self._archive_vocab = ('gbw-ebb1a287.zip',
-                               '63b335dcc27b6804d0a14acb88332d2602fe0f59')
-        self._data_file = {'train': ('training-monolingual.tokenized.shuffled',
-                                     'news.en-00*-of-00100',
-                                     '5e0d7050b37a99fd50ce7e07dc52468b2a9cd9e8'),
-                           'test': ('heldout-monolingual.tokenized.shuffled',
-                                    'news.en.heldout-00000-of-00050',
-                                    '0a8e2b7496ba0b5c05158f282b9b351356875445')}
-        self._vocab_file = ('gbw-ebb1a287.vocab',
-                            'ebb1a287ca14d8fa6f167c3a779e5e7ed63ac69f')
-        super(GBWStream, self).__init__('gbw', segment, bos, eos, skip_empty, root)
-
     @property
     def vocab(self):
         path = self._get_vocab()
diff --git a/src/gluonnlp/data/stream.py b/src/gluonnlp/data/stream.py
index 62c65ce24a..6f5d2d4f38 100644
--- a/src/gluonnlp/data/stream.py
+++ b/src/gluonnlp/data/stream.py
@@ -25,6 +25,7 @@
 import multiprocessing
 import multiprocessing.pool
 import os
+import queue
 import random
 import sys
 import threading
@@ -33,12 +34,7 @@
 
 import numpy as np
 import mxnet as mx
-from mxnet.gluon.data import RandomSampler, SequentialSampler, Sampler
-
-try:
-    import Queue as queue
-except ImportError:
-    import queue
+from mxnet.gluon.data import RandomSampler, Sampler, SequentialSampler
 
 __all__ = [
     'DataStream', 'SimpleDataStream', 'DatasetStream', 'SimpleDatasetStream',
diff --git a/src/gluonnlp/embedding/__init__.py b/src/gluonnlp/embedding/__init__.py
index 70411f6b5e..3ca8826cf1 100644
--- a/src/gluonnlp/embedding/__init__.py
+++ b/src/gluonnlp/embedding/__init__.py
@@ -18,8 +18,7 @@
 # pylint: disable=wildcard-import
 """Word embeddings."""
 
+from . import evaluation, token_embedding
 from .token_embedding import *
-from . import evaluation
-
 __all__ = (token_embedding.__all__ + ['evaluation'])
diff --git a/src/gluonnlp/embedding/token_embedding.py b/src/gluonnlp/embedding/token_embedding.py
index 12f2cb396b..9c3eb8ba9b 100644
--- a/src/gluonnlp/embedding/token_embedding.py
+++ b/src/gluonnlp/embedding/token_embedding.py
@@ -16,7 +16,6 @@
 # under the License.
 
 # pylint: disable=consider-iterating-dictionary, too-many-lines
-
 """Text token embedding."""
 
 __all__ = [
@@ -214,16 +213,21 @@ def __init__(self, unknown_token=C.UNK_TOKEN, init_unknown_vec=INIT_UNKNOWN_VEC,
             if idx_to_vec.shape[0] != len(idx_to_token):
                 raise ValueError('idx_to_token and idx_to_vec must contain '
                                  'the same number of tokens and embeddings respectively.')
-            if init_unknown_vec is not None:
-                logging.info('Ignoring init_unknown_vec as idx_to_vec is specified')
             if unknown_token is not None:
                 try:
                     unknown_index = idx_to_token.index(unknown_token)
+                    if init_unknown_vec is not None:
+                        logging.info('Ignoring init_unknown_vec as idx_to_vec is specified')
                 except ValueError:
-                    idx_to_token.insert(0, unknown_token)
-                    idx_to_vec = nd.concat(init_unknown_vec((1, idx_to_vec.shape[1])), idx_to_vec,
-                                           dim=0)
-                    unknown_index = 0
+                    if init_unknown_vec is not None:
+                        idx_to_token.insert(0, unknown_token)
+                        idx_to_vec = nd.concat(init_unknown_vec((1, idx_to_vec.shape[1])),
+                                               idx_to_vec, dim=0)
+                        unknown_index = 0
+                    else:
+                        raise ValueError('unknown_token "{}" is not part of idx_to_vec but '
+                                         'init_unknown_vec is None. '
+                                         'You must provide either of them.'.format(unknown_token))
 
             # Initialization
             self._unknown_token = unknown_token
@@ -350,7 +354,7 @@ def _load_embedding_txt(pretrained_file_path, elem_delim, unknown_token,
         with io.open(pretrained_file_path, 'rb') as f:
             for line_num, line in enumerate(f):
                 try:
-                    line = line.decode(encoding)
+                    line = line.decode(encoding)  # pytype: disable=attribute-error
                 except ValueError:
                     warnings.warn('line {} in {}: failed to decode. Skipping.'
                                   .format(line_num, pretrained_file_path))
@@ -1286,7 +1290,7 @@ def _load_w2v_binary(cls, pretrained_file_path, unknown_token,
         loaded_unknown_vec = None
         pretrained_file_path = os.path.expanduser(pretrained_file_path)
         with io.open(pretrained_file_path, 'rb') as f:
-            header = f.readline().decode(encoding=encoding)
+            header = f.readline().decode(encoding=encoding)  # pytype: disable=attribute-error
             vocab_size, vec_len = (int(x) for x in header.split())
             if unknown_token:
                 # Reserve a vector slot for the unknown token at the very beggining
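Note: besides the annotation work, the `TokenEmbedding.__init__` change above adds one behavioural guard: if `unknown_token` is missing from `idx_to_vec` and `init_unknown_vec` is `None`, a `ValueError` is now raised up front. A hedged sketch of both cases, assuming `TokenEmbedding` is constructed directly from `idx_to_token`/`idx_to_vec` as the code above does:

```python
import mxnet as mx
from gluonnlp.embedding import TokenEmbedding

# '<unk>' is absent from idx_to_token, but init_unknown_vec supplies a vector for it,
# so a slot is prepended at index 0 (the pre-existing behaviour).
emb = TokenEmbedding(idx_to_token=['hello', 'world'], idx_to_vec=mx.nd.ones((2, 5)),
                     unknown_token='<unk>', init_unknown_vec=mx.nd.zeros)

# With init_unknown_vec=None the same call now raises the new ValueError, since
# there is no way to obtain a vector for the unknown token.
try:
    TokenEmbedding(idx_to_token=['hello', 'world'], idx_to_vec=mx.nd.ones((2, 5)),
                   unknown_token='<unk>', init_unknown_vec=None)
except ValueError as e:
    print(e)
```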
diff --git a/src/gluonnlp/initializer/__init__.py b/src/gluonnlp/initializer/__init__.py
index 438bd087fd..8279515af8 100644
--- a/src/gluonnlp/initializer/__init__.py
+++ b/src/gluonnlp/initializer/__init__.py
@@ -18,6 +18,8 @@
 # pylint: disable=wildcard-import
 """NLP initializer."""
 
+from . import initializer
+
 from .initializer import *
 
 __all__ = initializer.__all__
diff --git a/src/gluonnlp/loss/__init__.py b/src/gluonnlp/loss/__init__.py
index 02dc553547..c87346c288 100644
--- a/src/gluonnlp/loss/__init__.py
+++ b/src/gluonnlp/loss/__init__.py
@@ -18,6 +18,8 @@
 # pylint: disable=wildcard-import
 """NLP loss."""
 
+from . import activation_regularizer, loss, label_smoothing
+
 from .activation_regularizer import *
 from .loss import *
 from .label_smoothing import *
diff --git a/src/gluonnlp/metric/__init__.py b/src/gluonnlp/metric/__init__.py
index eb753f0046..cdf34e27ff 100644
--- a/src/gluonnlp/metric/__init__.py
+++ b/src/gluonnlp/metric/__init__.py
@@ -18,6 +18,8 @@
 # pylint: disable=wildcard-import
 """NLP Metrics."""
 
+from . import masked_accuracy
+
 from .masked_accuracy import *
 
 __all__ = masked_accuracy.__all__
diff --git a/src/gluonnlp/model/sequence_sampler.py b/src/gluonnlp/model/sequence_sampler.py
index 550d0efa7b..60c034f83b 100644
--- a/src/gluonnlp/model/sequence_sampler.py
+++ b/src/gluonnlp/model/sequence_sampler.py
@@ -18,11 +18,18 @@
 
 __all__ = ['BeamSearchScorer', 'BeamSearchSampler', 'HybridBeamSearchSampler', 'SequenceSampler']
 
+from typing import TypeVar
+
 import numpy as np
+
 import mxnet as mx
 from mxnet.gluon import HybridBlock
+
 from .._constants import LARGE_NEGATIVE_FLOAT
 
+__T = TypeVar('__T')
+
+
 class BeamSearchScorer(HybridBlock):
     r"""Score function used in beam search.
@@ -141,7 +148,7 @@ def _reconstruct_flattened_structure(structure, flattened):
     raise NotImplementedError
 
 
-def _expand_to_beam_size(data, beam_size, batch_size, state_info=None):
+def _expand_to_beam_size(data: __T, beam_size, batch_size, state_info=None) -> __T:
     """Tile all the states to have batch_size * beam_size on the batch axis.
 
     Parameters
diff --git a/src/gluonnlp/optimizer/__init__.py b/src/gluonnlp/optimizer/__init__.py
index 7e052e4391..591fb16568 100644
--- a/src/gluonnlp/optimizer/__init__.py
+++ b/src/gluonnlp/optimizer/__init__.py
@@ -18,6 +18,8 @@
 # pylint: disable=wildcard-import
 """NLP optimizer."""
 
+from . import bert_adam, lamb
+
 from .bert_adam import *
 from .lamb import *
 
diff --git a/src/gluonnlp/utils/__init__.py b/src/gluonnlp/utils/__init__.py
index d36bfd7aac..df40b44bbc 100644
--- a/src/gluonnlp/utils/__init__.py
+++ b/src/gluonnlp/utils/__init__.py
@@ -18,11 +18,10 @@
 # pylint: disable=wildcard-import, arguments-differ
 """Module for utility functions."""
 
-from . import (parallel, parameter, files)
-
+from . import files, parallel, parameter, version
+from .files import *
 from .parallel import *
 from .parameter import *
-from .files import *
 from .version import *
 
 __all__ = parallel.__all__ + parameter.__all__ + files.__all__ + version.__all__
diff --git a/src/gluonnlp/utils/parallel.py b/src/gluonnlp/utils/parallel.py
index 76b52ac0e5..42dc5e60f7 100644
--- a/src/gluonnlp/utils/parallel.py
+++ b/src/gluonnlp/utils/parallel.py
@@ -15,11 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 """Utility functions for parallel processing."""
+import queue
 import threading
-try:
-    import Queue as queue
-except ImportError:
-    import queue
 
 __all__ = ['Parallelizable', 'Parallel']
diff --git a/src/gluonnlp/vocab/vocab.py b/src/gluonnlp/vocab/vocab.py
index 46c35c4be6..bbc94dd343 100644
--- a/src/gluonnlp/vocab/vocab.py
+++ b/src/gluonnlp/vocab/vocab.py
@@ -261,7 +261,8 @@ def __init__(self, counter: Optional[Counter] = None, max_size: Optional[int] =
         if token_to_idx:
             self._sort_index_according_to_user_specification(token_to_idx)
         if unknown_token:
-            self._token_to_idx._default = self._token_to_idx[unknown_token]
+            self._token_to_idx._default = \
+                self._token_to_idx[unknown_token]  # pytype: disable=not-writable
 
     def _index_counter_keys(self, counter, unknown_token, special_tokens, max_size,
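Note: `_token_to_idx._default` is what makes out-of-vocabulary lookups fall back to the unknown token's index; the change above only silences pytype's `not-writable` warning on that private attribute. A small usage sketch:

```python
from collections import Counter

import gluonnlp as nlp

vocab = nlp.Vocab(Counter(['hello', 'world', 'world']))
print(vocab['world'])       # regular index of a known token
print(vocab['never-seen'])  # same as vocab[vocab.unknown_token], via _token_to_idx._default
```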