From 5e11334f5c00fd2875ab75c670b2595c560c30fc Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Sat, 16 Nov 2019 07:39:56 +0800 Subject: [PATCH] Switch CI to Python 3.5 and declare Python 3.5 support (#1009) * Switch CI to Python 3.5 and declare Python 3.5 support * Fix lint after updating pylint * Fix imports * Workaround bug in mx-theme setup.py * Fix Python 3.5 support in codebase * Fix lint in scripts/bert * Delete Python3.5 incompatible multiprocessing.Pool usage of BERTDatasetTransform Traceback (most recent call last): File "./scripts/bert/finetune_classifier.py", line 373, in bert_tokenizer, task, batch_size, dev_batch_size, args.max_len, vocabulary, args.pad) File "./scripts/bert/finetune_classifier.py", line 306, in preprocess_data data_train = mx.gluon.data.SimpleDataset(pool.map(trans, train_tsv)) File "/var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@6/conda/gpu/py3-master/lib/python3.5/multiprocessing/pool.py", line 266, in map return self._map_async(func, iterable, mapstar, chunksize).get() File "/var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@6/conda/gpu/py3-master/lib/python3.5/multiprocessing/pool.py", line 644, in get raise self._value File "/var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@6/conda/gpu/py3-master/lib/python3.5/multiprocessing/pool.py", line 424, in _handle_tasks put(task) File "/var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@6/conda/gpu/py3-master/lib/python3.5/multiprocessing/connection.py", line 206, in send self._send_bytes(ForkingPickler.dumps(obj)) File "/var/lib/jenkins/workspace/gluon-nlp-gpu-py3-master@6/conda/gpu/py3-master/lib/python3.5/multiprocessing/reduction.py", line 50, in dumps cls(buf, protocol).dump(obj) _pickle.PicklingError: Can't pickle : attribute lookup module on builtins failed --- README.rst | 6 +- docs/install/install-include.rst | 2 +- env/cpu/py3-master.yml | 36 +++++------ env/docker/py3.yml | 59 ++++++++++--------- env/gpu/py3-master.yml | 56 +++++++++--------- .../convert_paddle_to_gluon.py | 2 +- scripts/bert/data/pretrain.py | 2 +- scripts/bert/finetune_classifier.py | 16 ++--- scripts/bert/model/ner.py | 3 +- scripts/bert/pretraining_utils.py | 3 +- scripts/bert/run_pretraining.py | 5 +- scripts/bert/utils.py | 5 +- scripts/language_model/transformer/data.py | 4 +- setup.py | 2 +- src/gluonnlp/data/batchify/embedding.py | 22 +++---- src/gluonnlp/data/stream.py | 3 +- src/gluonnlp/data/transforms.py | 24 ++++---- src/gluonnlp/data/utils.py | 1 + .../data/word_embedding_evaluation.py | 3 +- src/gluonnlp/initializer/initializer.py | 2 +- src/gluonnlp/model/__init__.py | 24 ++++---- src/gluonnlp/model/attention_cell.py | 11 ++-- src/gluonnlp/model/bert.py | 4 +- src/gluonnlp/model/convolutional_encoder.py | 2 +- src/gluonnlp/model/highway.py | 2 +- src/gluonnlp/model/language_model.py | 2 +- src/gluonnlp/model/utils.py | 2 +- src/gluonnlp/optimizer/bert_adam.py | 18 +----- src/gluonnlp/utils/files.py | 2 +- src/gluonnlp/utils/version.py | 1 + src/gluonnlp/vocab/__init__.py | 6 +- src/gluonnlp/vocab/vocab.py | 7 +-- 32 files changed, 158 insertions(+), 179 deletions(-) diff --git a/README.rst b/README.rst index d6329d3a86..10f00b6989 100644 --- a/README.rst +++ b/README.rst @@ -15,7 +15,7 @@ GluonNLP: Your Choice of Deep Learning for NLP .. raw:: html - + @@ -43,8 +43,8 @@ News Installation ============ -Make sure you have Python 3.6 or newer and a recent version of MXNet (our CI -server runs the testsuite with Python 3.6). 
+Make sure you have Python 3.5 or newer and a recent version of MXNet (our CI +server runs the testsuite with Python 3.5). You can install ``MXNet`` and ``GluonNLP`` using pip. diff --git a/docs/install/install-include.rst b/docs/install/install-include.rst index a2ebc586c5..13f086e7af 100644 --- a/docs/install/install-include.rst +++ b/docs/install/install-include.rst @@ -47,7 +47,7 @@ Select your preferences and run the install command. .. admonition:: Prerequisites: - Requires `pip >= 9. `_. - Python 3.6+ are supported. + Python 3.5+ are supported. .. container:: nightly diff --git a/env/cpu/py3-master.yml b/env/cpu/py3-master.yml index 51cbbf590e..e51e77234d 100644 --- a/env/cpu/py3-master.yml +++ b/env/cpu/py3-master.yml @@ -1,26 +1,26 @@ channels: - conda-forge dependencies: - - python=3.6 - - pip=18.1 - - cython + - python=3.5 + - pip - perl - - pylint=2.3.1 - - flake8 - - sphinx=2.1.2 - - spacy>2 - - nltk - - pytest=4.5.0 - - pytest-env - - flaky=3.5.3 - - pytest-cov=2.7.1 - - mock<3 - - pytest-xdist<2 - - regex - - scipy=1.3.1 - pip: + - cython + - pytest==5.2.3 + - pytest-env==0.6.2 + - pytest-cov==2.8.1 + - pytest-xdist==1.30.0 + - pylint==2.4.4 - pylint-quotes==0.2.1 + - flaky==3.6.1 + - flake8==3.7.9 + - mock<3 + - sphinx==2.2.1 - mxnet>=1.6.0b20191103 - - sacremoses - - sentencepiece<0.2 + - scipy==1.3.2 + - regex==2019.11.1 + - nltk==3.4.5 + - sacremoses==0.0.35 + - spacy==2.2.2 + - sentencepiece==0.1.83 - sphinx-autodoc-typehints==1.7.0 diff --git a/env/docker/py3.yml b/env/docker/py3.yml index bb73125d4e..11a638464c 100644 --- a/env/docker/py3.yml +++ b/env/docker/py3.yml @@ -1,39 +1,40 @@ channels: - conda-forge dependencies: - - python=3.6 - - pip=18.1 - - cython + - python=3.5 + - pip - perl - - pylint=1.9.2 - - flake8 - - sphinx=2.1.2 - - spacy>2 - - nltk - - pytest=4.5.0 - - pytest-env - - flaky=3.5.3 - - pytest-cov=2.7.1 - - mock<3 - - pytest-xdist<2 - - recommonmark - pandoc=1.19.2 - - notedown - - numba>=v0.40.0 - - sphinx-gallery - - nbsphinx>=0.3.4,<0.4 - - nbconvert=5.4.0 - tornado=5.1.1 - - ipython - - ipykernel - - regex - - scipy=1.3.1 - - scikit-learn=0.21.3 + - sphinx=2.2.1 - pip: - - pylint-quotes<0.2 - - mxnet-cu100>=1.6.0b20191027 - - sacremoses - - sentencepiece<0.2 + - notedown==1.5.1 + - sphinx-gallery==0.4.0 + - recommonmark==0.6.0 + - nbconvert==5.6.1 + - nbsphinx>=0.3.4,<0.4 + - ipython + - ipykernel + - numba==0.46 - https://github.com/szha/mx-theme/tarball/master - seaborn - jieba + - scikit-learn==0.21.3 + - cython + - pytest==5.2.3 + - pytest-env==0.6.2 + - pytest-cov==2.8.1 + - pytest-xdist==1.30.0 + - pylint==2.4.4 + - pylint-quotes==0.2.1 + - flaky==3.6.1 + - flake8==3.7.9 + - mock<3 + - mxnet-cu100>=1.6.0b20191027,!=1.6.0b20191102 + - scipy==1.3.2 + - regex==2019.11.1 + - nltk==3.4.5 + - sacremoses==0.0.35 + - spacy==2.2.2 + - sentencepiece==0.1.83 + - sphinx-autodoc-typehints==1.7.0 diff --git a/env/gpu/py3-master.yml b/env/gpu/py3-master.yml index fc2687a608..665e1bb432 100644 --- a/env/gpu/py3-master.yml +++ b/env/gpu/py3-master.yml @@ -1,39 +1,39 @@ channels: - conda-forge dependencies: - - python=3.6 - - pip=18.1 - - cython + - python=3.5 + - pip - perl - - pylint=2.3.1 - - flake8 - - sphinx=2.1.2 - - spacy>2 - - nltk - - pytest=4.5.0 - - pytest-env - - flaky=3.5.3 - - pytest-cov=2.7.1 - - mock<3 - - pytest-xdist<2 - - recommonmark - pandoc=1.19.2 - - notedown - - numba>=v0.40.0 - - sphinx-gallery - - nbsphinx>=0.3.4,<0.4 - - nbconvert=5.4.0 - tornado=5.1.1 - - ipython - - ipykernel - - regex - - scipy=1.3.1 + - sphinx=2.2.1 - pip: 
- - pylint-quotes<0.2 - - mxnet-cu100>=1.6.0b20191027,!=1.6.0b20191102 - - sacremoses - - sentencepiece<0.2 + - notedown==1.5.1 + - sphinx-gallery==0.4.0 + - recommonmark==0.6.0 + - nbconvert==5.6.1 + - nbsphinx>=0.3.4,<0.4 + - ipython + - ipykernel + - numba==0.46 - https://github.com/szha/mx-theme/tarball/master - seaborn - jieba + - cython + - pytest==5.2.3 + - pytest-env==0.6.2 + - pytest-cov==2.8.1 + - pytest-xdist==1.30.0 + - pylint==2.4.4 + - pylint-quotes==0.2.1 + - flaky==3.6.1 + - flake8==3.7.9 + - mock<3 + - mxnet-cu100>=1.6.0b20191027,!=1.6.0b20191102 + - scipy==1.3.2 + - regex==2019.11.1 + - nltk==3.4.5 + - sacremoses==0.0.35 + - spacy==2.2.2 + - sentencepiece==0.1.83 - sphinx-autodoc-typehints==1.7.0 diff --git a/scripts/bert/conversion_tools/convert_paddle_to_gluon.py b/scripts/bert/conversion_tools/convert_paddle_to_gluon.py index 33feab5aa9..99e18cde28 100644 --- a/scripts/bert/conversion_tools/convert_paddle_to_gluon.py +++ b/scripts/bert/conversion_tools/convert_paddle_to_gluon.py @@ -132,7 +132,7 @@ def extract_weights(args): if 'w_0' in ernie_name: fluid_array = fluid_array.transpose() state_dict[gluon_name] = fluid_array - print(f'{ernie_name} -> {gluon_name} {fluid_array.shape}') + print('{} -> {} {}'.format(ernie_name, gluon_name, fluid_array.shape)) print('extract weights done!'.center(60, '=')) return state_dict diff --git a/scripts/bert/data/pretrain.py b/scripts/bert/data/pretrain.py index 7626e57404..c66e0e6980 100644 --- a/scripts/bert/data/pretrain.py +++ b/scripts/bert/data/pretrain.py @@ -19,6 +19,7 @@ """Dataset for pre-training. """ import logging +from mxnet.gluon.data import DataLoader import gluonnlp as nlp from gluonnlp.data.batchify import Tuple, Stack, Pad try: @@ -70,7 +71,6 @@ def __init__(self, num_ctxes, vocab): Stack()) # valid_length def __call__(self, dataset, sampler): - from mxnet.gluon.data import DataLoader dataloader = DataLoader(dataset=dataset, batch_sampler=sampler, batchify_fn=self._batchify_fn, diff --git a/scripts/bert/finetune_classifier.py b/scripts/bert/finetune_classifier.py index b67c5bc9e5..b4e89ee35b 100644 --- a/scripts/bert/finetune_classifier.py +++ b/scripts/bert/finetune_classifier.py @@ -39,10 +39,10 @@ import random import logging import warnings -import multiprocessing import numpy as np import mxnet as mx from mxnet import gluon +from mxnet.contrib.amp import amp import gluonnlp as nlp from gluonnlp.data import BERTTokenizer from gluonnlp.model import BERTClassifier, RoBERTaClassifier @@ -208,7 +208,6 @@ # data type with mixed precision training if args.dtype == 'float16': try: - from mxnet.contrib import amp # pylint: disable=ungrouped-imports # monkey patch amp list since topk does not support fp16 amp.lists.symbol.FP32_FUNCS.append('topk') amp.lists.symbol.FP16_FP32_FUNCS.remove('topk') @@ -216,11 +215,6 @@ except ValueError: # topk is already in the FP32_FUNCS list amp.init() - except ImportError: - # amp is not available - logging.info('Mixed precision training with float16 requires MXNet >= ' - '1.5.0b20190627. 
Please consider upgrading your MXNet version.') - exit() # model and loss only_inference = args.only_inference @@ -294,8 +288,6 @@ def preprocess_data(tokenizer, task, batch_size, dev_batch_size, max_len, vocab, pad=False): """Train/eval Data preparation function.""" - pool = multiprocessing.Pool() - # transformation for data train and dev label_dtype = 'float32' if not task.class_labels else 'int32' trans = BERTDatasetTransform(tokenizer, max_len, @@ -308,7 +300,7 @@ def preprocess_data(tokenizer, task, batch_size, dev_batch_size, max_len, vocab, # data train # task.dataset_train returns (segment_name, dataset) train_tsv = task.dataset_train()[1] - data_train = mx.gluon.data.SimpleDataset(pool.map(trans, train_tsv)) + data_train = mx.gluon.data.SimpleDataset(list(map(trans, train_tsv))) data_train_len = data_train.transform( lambda input_id, length, segment_id, label_id: length, lazy=False) # bucket sampler for training @@ -336,7 +328,7 @@ def preprocess_data(tokenizer, task, batch_size, dev_batch_size, max_len, vocab, dev_tsv_list = dev_tsv if isinstance(dev_tsv, list) else [dev_tsv] loader_dev_list = [] for segment, data in dev_tsv_list: - data_dev = mx.gluon.data.SimpleDataset(pool.map(trans, data)) + data_dev = mx.gluon.data.SimpleDataset(list(map(trans, data))) loader_dev = mx.gluon.data.DataLoader( data_dev, batch_size=dev_batch_size, @@ -361,7 +353,7 @@ def preprocess_data(tokenizer, task, batch_size, dev_batch_size, max_len, vocab, test_tsv_list = test_tsv if isinstance(test_tsv, list) else [test_tsv] loader_test_list = [] for segment, data in test_tsv_list: - data_test = mx.gluon.data.SimpleDataset(pool.map(test_trans, data)) + data_test = mx.gluon.data.SimpleDataset(list(map(test_trans, data))) loader_test = mx.gluon.data.DataLoader( data_test, batch_size=dev_batch_size, diff --git a/scripts/bert/model/ner.py b/scripts/bert/model/ner.py index 21000aaee2..18a2076600 100644 --- a/scripts/bert/model/ner.py +++ b/scripts/bert/model/ner.py @@ -16,6 +16,8 @@ # under the License. 
"""Gluon model block for the named entity recognition task.""" +from contextlib import ExitStack + import mxnet as mx from mxnet.gluon import Block, nn @@ -91,7 +93,6 @@ def attach_prediction(data_loader, net, ctx, is_train): text_ids, token_types, valid_length, tag_ids, flag_nonnull_tag = \ [x.astype('float32').as_in_context(ctx) for x in data] - from contextlib import ExitStack with ExitStack() as stack: if is_train: stack.enter_context(mx.autograd.record()) diff --git a/scripts/bert/pretraining_utils.py b/scripts/bert/pretraining_utils.py index 1ce1d099d9..2b6e1d229b 100644 --- a/scripts/bert/pretraining_utils.py +++ b/scripts/bert/pretraining_utils.py @@ -18,6 +18,7 @@ """Utilities for pre-training.""" import time import os +import sys import logging import random import multiprocessing @@ -437,4 +438,4 @@ def profile(curr_step, start_step, end_step, profile_name='profile.json', logging.info(mx.profiler.dumps()) mx.profiler.dump() if early_exit: - exit() + sys.exit(0) diff --git a/scripts/bert/run_pretraining.py b/scripts/bert/run_pretraining.py index 3c47f9473d..d7586700a0 100644 --- a/scripts/bert/run_pretraining.py +++ b/scripts/bert/run_pretraining.py @@ -29,6 +29,7 @@ # pylint:disable=redefined-outer-name,logging-format-interpolation import os +import sys import random import warnings import logging @@ -183,10 +184,10 @@ def init_comm(backend): # backend specific implementation if backend == 'horovod': try: - import horovod.mxnet as hvd + import horovod.mxnet as hvd # pylint: disable=import-outside-toplevel except ImportError: logging.info('horovod must be installed.') - exit() + sys.exit(1) hvd.init() store = None num_workers = hvd.size() diff --git a/scripts/bert/utils.py b/scripts/bert/utils.py index aebd7bdba2..c2c0b5694d 100644 --- a/scripts/bert/utils.py +++ b/scripts/bert/utils.py @@ -16,6 +16,7 @@ # under the License. """Utility functions for BERT.""" +import sys import logging import collections import hashlib @@ -48,7 +49,7 @@ def get_hash(filename): def read_tf_checkpoint(path): """read tensorflow checkpoint""" - from tensorflow.python import pywrap_tensorflow + from tensorflow.python import pywrap_tensorflow # pylint: disable=import-outside-toplevel tensors = {} reader = pywrap_tensorflow.NewCheckpointReader(path) var_to_shape_map = reader.get_variable_to_shape_map() @@ -72,7 +73,7 @@ def profile(curr_step, start_step, end_step, profile_name='profile.json', logging.info(mx.profiler.dumps()) mx.profiler.dump() if early_exit: - exit() + sys.exit(0) def load_text_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" diff --git a/scripts/language_model/transformer/data.py b/scripts/language_model/transformer/data.py index 4f808c29f4..b4b0ef3b7f 100644 --- a/scripts/language_model/transformer/data.py +++ b/scripts/language_model/transformer/data.py @@ -62,7 +62,7 @@ def __init__(self, sentencepiece_path: str, lower: bool = False, remove_space: b self._lower = lower self._remove_space = remove_space self._keep_accents = keep_accents - self._sentencepiece: Optional[nlp.data.SentencepieceTokenizer] = None + self._sentencepiece = None # type: Optional[nlp.data.SentencepieceTokenizer] def __call__(self, sample: str) -> List[str]: """Tokenize a sample. 
@@ -94,7 +94,7 @@ def __call__(self, sample: str) -> List[str]: self._sentencepiece = nlp.data.SentencepieceTokenizer(self._sentencepiece_path) pieces = self._sentencepiece(outputs) - new_pieces: List[str] = [] + new_pieces = [] # type: List[str] for piece in pieces: if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit(): cur_pieces = self._sentencepiece(piece[:-1].replace(self._spiece_prefix, '')) diff --git a/setup.py b/setup.py index 7d033b009b..103ba8482e 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ def find_version(*file_paths): # Metadata name='gluonnlp', version=VERSION, - python_requires='>=3.6', + python_requires='>=3.5', author='Gluon NLP Toolkit Contributors', author_email='mxnet-gluon@amazon.com', url='https://github.com/dmlc/gluon-nlp', diff --git a/src/gluonnlp/data/batchify/embedding.py b/src/gluonnlp/data/batchify/embedding.py index fbdb46b2df..bce48271be 100644 --- a/src/gluonnlp/data/batchify/embedding.py +++ b/src/gluonnlp/data/batchify/embedding.py @@ -206,21 +206,21 @@ def _context_generator(sentence_boundaries, window, batch_size, # In SkipGram mode, there may be some leftover contexts # form the last batch continue - elif i < num_rows: - num_context_skip = 0 - context_row.append(i) - context_col.append(context) - if cbow: - context_data.append(1.0 / len(contexts)) - else: - center_batch.append(center) - context_data.append(1) - i += 1 - else: + if i >= num_rows: num_context_skip = j assert not cbow break + num_context_skip = 0 + context_row.append(i) + context_col.append(context) + if cbow: + context_data.append(1.0 / len(contexts)) + else: + center_batch.append(center) + context_data.append(1) + i += 1 + if cbow: center_batch.append(center) i += 1 diff --git a/src/gluonnlp/data/stream.py b/src/gluonnlp/data/stream.py index e8a6b1e1e1..62c65ce24a 100644 --- a/src/gluonnlp/data/stream.py +++ b/src/gluonnlp/data/stream.py @@ -270,8 +270,7 @@ def run(self): c = self._controlq.get(False) if c is None: break - else: - raise RuntimeError('Got unexpected control code {}'.format(repr(c))) + raise RuntimeError('Got unexpected control code {}'.format(repr(c))) except queue.Empty: pass except RuntimeError as e: diff --git a/src/gluonnlp/data/transforms.py b/src/gluonnlp/data/transforms.py index 139cf8c4cc..e3ac26ecf5 100644 --- a/src/gluonnlp/data/transforms.py +++ b/src/gluonnlp/data/transforms.py @@ -178,7 +178,7 @@ class SacreMosesTokenizer: """ def __init__(self): - from sacremoses import MosesTokenizer + from sacremoses import MosesTokenizer # pylint: disable=import-outside-toplevel self._tokenizer = MosesTokenizer() def __call__(self, sample: str, return_str: bool = False): @@ -228,8 +228,8 @@ class SpacyTokenizer: def __init__(self, lang='en_core_web_sm'): try: - import spacy - from pkg_resources import parse_version + import spacy # pylint: disable=import-outside-toplevel + from pkg_resources import parse_version # pylint: disable=import-outside-toplevel assert parse_version(spacy.__version__) >= parse_version('2.0.0'),\ 'We only support spacy>=2.0.0' except ImportError: @@ -292,7 +292,7 @@ class SacreMosesDetokenizer: def __init__(self, return_str=True): self._return_str = return_str - from sacremoses import MosesDetokenizer + from sacremoses import MosesDetokenizer # pylint: disable=import-outside-toplevel self._detokenizer = MosesDetokenizer() def __call__(self, sample: List[str], return_str: Optional[bool] = None): @@ -338,7 +338,7 @@ class JiebaTokenizer: def __init__(self): try: - import jieba + import jieba # pylint: 
disable=import-outside-toplevel except ImportError: raise ImportError( 'jieba is not installed. You must install jieba in order to use the ' @@ -404,7 +404,7 @@ def __init__(self, assert is_java_exist == 0, 'Java is not installed. You must install Java 8.0' \ 'in order to use the NLTKStanfordSegmenter' try: - from nltk.tokenize import StanfordSegmenter + from nltk.tokenize import StanfordSegmenter # pylint: disable=import-outside-toplevel except ImportError: raise ImportError( 'NLTK or relevant packages are not installed. You must install NLTK ' @@ -474,13 +474,13 @@ def __call__(self, sample): ret : list of strs List of tokens """ - return [tok for tok in self._tokenizer.segment(sample).strip().split()] + return self._tokenizer.segment(sample).strip().split() class _SentencepieceProcessor: def __init__(self, path): try: - import sentencepiece + import sentencepiece # pylint: disable=import-outside-toplevel except ImportError: raise ImportError( 'sentencepiece is not installed. You must install sentencepiece ' @@ -1235,7 +1235,8 @@ class GPT2BPETokenizer(_GPT2BPE): '1a770728fd102bc9dc332f322e6bfb294767a685') def __init__(self, root=os.path.join(get_home_dir(), 'models')): try: - import regex as re + import regex # pylint: disable=import-outside-toplevel + self._regex = regex except ImportError: raise ImportError( 'GPT2BPETokenizer requires regex. ' @@ -1285,7 +1286,7 @@ def __init__(self, root=os.path.join(get_home_dir(), 'models')): raise ValueError('Downloaded file has different hash. Please try again.') self._read_bpe_ranks(file_path) self._cache = {} - self._token_pattern = re.compile( + self._token_pattern = self._regex.compile( r'\'s|\'t|\'re|\'ve|\'m|\'ll|\'d| ?\p{L}+' r'| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+') @@ -1347,9 +1348,8 @@ def __call__(self, sample): ------- ret : list(str) """ - import regex as re ret = [] - for word_token in re.findall(self._token_pattern, sample): + for word_token in self._regex.findall(self._token_pattern, sample): word_token = bytearray(word_token.encode('utf-8')) word_token = ''.join(self._byte_encoder[code] for code in word_token) ret.extend(self.get_bpe_subword(word_token)) diff --git a/src/gluonnlp/data/utils.py b/src/gluonnlp/data/utils.py index 8563c5f15c..121a276208 100644 --- a/src/gluonnlp/data/utils.py +++ b/src/gluonnlp/data/utils.py @@ -348,6 +348,7 @@ def _load_pretrained_vocab(name, root, cls=None): def _load_vocab_file(file_path, cls): with open(file_path, 'r') as f: if cls is None: + # pylint: disable=import-outside-toplevel from ..vocab import Vocab cls = Vocab diff --git a/src/gluonnlp/data/word_embedding_evaluation.py b/src/gluonnlp/data/word_embedding_evaluation.py index c75e64ee9e..7e65190010 100644 --- a/src/gluonnlp/data/word_embedding_evaluation.py +++ b/src/gluonnlp/data/word_embedding_evaluation.py @@ -330,8 +330,7 @@ def __init__(self, root=os.path.join(get_home_dir(), 'datasets', def _get_data(self): datafilepath = os.path.join(self.root, self._archive_file[0]) - dataset = CorpusDataset(datafilepath, tokenizer=lambda x: x.split(',')) - return [row for row in dataset] + return list(CorpusDataset(datafilepath, tokenizer=lambda x: x.split(','))) @register diff --git a/src/gluonnlp/initializer/initializer.py b/src/gluonnlp/initializer/initializer.py index 51f1fbaffd..f86031a63a 100644 --- a/src/gluonnlp/initializer/initializer.py +++ b/src/gluonnlp/initializer/initializer.py @@ -80,7 +80,7 @@ class TruncNorm(Initializer): def __init__(self, mean=0, stdev=0.01, **kwargs): super(TruncNorm, self).__init__(**kwargs) try: - from 
scipy.stats import truncnorm + from scipy.stats import truncnorm # pylint: disable=import-outside-toplevel except ImportError: raise ImportError('SciPy is not installed. ' 'You must install SciPy >= 1.0.0 in order to use the ' diff --git a/src/gluonnlp/model/__init__.py b/src/gluonnlp/model/__init__.py index bcca012c7f..f749da6d30 100644 --- a/src/gluonnlp/model/__init__.py +++ b/src/gluonnlp/model/__init__.py @@ -67,25 +67,27 @@ """ -from . import (attention_cell, sequence_sampler, block, convolutional_encoder, - highway, language_model, parameter, sampled_block, train, utils, bilm_encoder, - lstmpcellwithclip, elmo) +from . import (attention_cell, bert, bilm_encoder, block, + convolutional_encoder, elmo, highway, language_model, + lstmpcellwithclip, parameter, sampled_block, + seq2seq_encoder_decoder, sequence_sampler, train, transformer, + utils) from .attention_cell import * -from .sequence_sampler import * +from .bert import * +from .bilm_encoder import BiLMEncoder from .block import * from .convolutional_encoder import * -from .seq2seq_encoder_decoder import * -from .translation import * -from .transformer import * -from .bert import * +from .elmo import * from .highway import * from .language_model import * +from .lstmpcellwithclip import LSTMPCellWithClip from .parameter import * from .sampled_block import * +from .seq2seq_encoder_decoder import * +from .sequence_sampler import * +from .transformer import * +from .translation import * from .utils import * -from .bilm_encoder import BiLMEncoder -from .lstmpcellwithclip import LSTMPCellWithClip -from .elmo import * __all__ = language_model.__all__ + sequence_sampler.__all__ + attention_cell.__all__ + \ utils.__all__ + parameter.__all__ + block.__all__ + highway.__all__ + \ diff --git a/src/gluonnlp/model/attention_cell.py b/src/gluonnlp/model/attention_cell.py index cec069da36..6701020f69 100644 --- a/src/gluonnlp/model/attention_cell.py +++ b/src/gluonnlp/model/attention_cell.py @@ -23,6 +23,7 @@ import mxnet as mx from mxnet.gluon.block import HybridBlock from mxnet.gluon import nn +from mxnet.contrib.amp import amp from .block import L2Normalization @@ -46,13 +47,9 @@ def _apply_mask(F, att_score, mask, dtype): if np.dtype(dtype) == np.float16: neg = -1e4 else: - try: - # if AMP (automatic mixed precision) is enabled, -1e18 will cause NaN. - from mxnet.contrib import amp - if amp.amp._amp_initialized: - neg = -1e4 - except ImportError: - pass + # if AMP (automatic mixed precision) is enabled, -1e18 will cause NaN. 
+ if amp._amp_initialized: + neg = -1e4 att_score = F.where(mask, att_score, neg * F.ones_like(att_score)) return att_score diff --git a/src/gluonnlp/model/bert.py b/src/gluonnlp/model/bert.py index 753b3b526c..e9ec861a5d 100644 --- a/src/gluonnlp/model/bert.py +++ b/src/gluonnlp/model/bert.py @@ -1162,7 +1162,7 @@ def get_roberta_model(model_name=None, dataset_name=None, vocab=None, pretrained activation=predefined_args.get('activation', 'gelu'), layer_norm_eps=predefined_args.get('layer_norm_eps', 1e-5)) - from ..vocab import Vocab + from ..vocab import Vocab # pylint: disable=import-outside-toplevel bert_vocab = _load_vocab(dataset_name, vocab, root, cls=Vocab) # BERT net = RoBERTaModel(encoder, len(bert_vocab), @@ -1270,7 +1270,7 @@ def get_bert_model(model_name=None, dataset_name=None, vocab=None, pretrained=Tr activation=predefined_args.get('activation', 'gelu'), layer_norm_eps=predefined_args.get('layer_norm_eps', 1e-12)) - from ..vocab import BERTVocab + from ..vocab import BERTVocab # pylint: disable=import-outside-toplevel # bert_vocab bert_vocab = _load_vocab(dataset_name, vocab, root, cls=BERTVocab) # BERT diff --git a/src/gluonnlp/model/convolutional_encoder.py b/src/gluonnlp/model/convolutional_encoder.py index 260b0451af..2fbaa0f5f9 100644 --- a/src/gluonnlp/model/convolutional_encoder.py +++ b/src/gluonnlp/model/convolutional_encoder.py @@ -22,7 +22,7 @@ from mxnet import gluon from mxnet.gluon import nn -from gluonnlp.initializer import HighwayBias +from ..initializer import HighwayBias from .highway import Highway diff --git a/src/gluonnlp/model/highway.py b/src/gluonnlp/model/highway.py index 4daf01ffa5..30aa8ae448 100644 --- a/src/gluonnlp/model/highway.py +++ b/src/gluonnlp/model/highway.py @@ -22,7 +22,7 @@ from mxnet import gluon from mxnet.gluon import nn -from gluonnlp.initializer import HighwayBias +from ..initializer import HighwayBias class Highway(gluon.HybridBlock): diff --git a/src/gluonnlp/model/language_model.py b/src/gluonnlp/model/language_model.py index 545e4245c5..29f5eedb56 100644 --- a/src/gluonnlp/model/language_model.py +++ b/src/gluonnlp/model/language_model.py @@ -25,7 +25,7 @@ from mxnet import nd, cpu, autograd, sym from mxnet.gluon.model_zoo import model_store -from gluonnlp.model import train +from . 
import train from .utils import _load_vocab, _load_pretrained_params from ..base import get_home_dir diff --git a/src/gluonnlp/model/utils.py b/src/gluonnlp/model/utils.py index d8b71a4d99..50aac6ce4f 100644 --- a/src/gluonnlp/model/utils.py +++ b/src/gluonnlp/model/utils.py @@ -24,7 +24,7 @@ from mxnet.gluon import Block, contrib, rnn from mxnet.gluon.model_zoo import model_store -from gluonnlp.data.utils import _load_pretrained_vocab +from ..data.utils import _load_pretrained_vocab from .parameter import WeightDropParameter from .lstmpcellwithclip import LSTMPCellWithClip diff --git a/src/gluonnlp/optimizer/bert_adam.py b/src/gluonnlp/optimizer/bert_adam.py index 6cc6539a49..a63022d776 100644 --- a/src/gluonnlp/optimizer/bert_adam.py +++ b/src/gluonnlp/optimizer/bert_adam.py @@ -20,6 +20,7 @@ import numpy from mxnet.optimizer import Optimizer, register from mxnet.ndarray import zeros, NDArray, full +from mxnet.ndarray.contrib import mp_adamw_update, adamw_update __all__ = ['BERTAdam'] @@ -93,14 +94,6 @@ def update_multi_precision(self, index, weight, grad, state): def _update_impl(self, indices, weight, grad, state, multi_precision=False): """update function""" - try: - from mxnet.ndarray.contrib import adamw_update - except ImportError: - raise ImportError('Failed to import nd.contrib.adamw_update from MXNet. ' - 'BERTAdam optimizer requires mxnet>=1.5.0b20190220. ' - 'Please upgrade your MXNet version. For example: ' - 'pip install mxnet-cu90 --pre. Otherwise, please consider ' - 'Adam optimizer with different hyper-parameters.') self._update_count(indices) lr = self._get_lr(indices) wd = self._get_wd(indices) @@ -120,15 +113,6 @@ def _update_impl(self, indices, weight, grad, state, multi_precision=False): adamw_update(weight, grad, mean, var, out=weight, lr=1, wd=wd, eta=lr, **kwargs) else: - try: - from mxnet.ndarray.contrib import mp_adamw_update - except ImportError: - raise ImportError('Failed to import ' - 'nd.contrib.mp_adamw_update from MXNet. ' - 'BERTAdam optimizer requires mxnet>=1.5.0b20190220. ' - 'Please upgrade your MXNet version. For example: ' - 'pip install mxnet-cu90 --pre. Otherwise, please consider ' - 'Adam optimizer with different hyper-parameters.') mean, var = state[0] mp_adamw_update(weight, grad, mean, var, state[1], out=weight, lr=1, wd=wd, eta=lr, **kwargs) diff --git a/src/gluonnlp/utils/files.py b/src/gluonnlp/utils/files.py index b321bd4cd7..f512a44f13 100644 --- a/src/gluonnlp/utils/files.py +++ b/src/gluonnlp/utils/files.py @@ -85,7 +85,7 @@ def __exit__(self, exec_type, exec_value, traceback): def _transfer_file_s3(filename, s3_filename, upload=True): """Transfer a file between S3 and local file system.""" try: - import boto3 + import boto3 # pylint: disable=import-outside-toplevel except ImportError: raise ImportError('boto3 is required to support s3 URI. Please install' 'boto3 via `pip install boto3`') diff --git a/src/gluonnlp/utils/version.py b/src/gluonnlp/utils/version.py index d67051a36c..0165d41464 100644 --- a/src/gluonnlp/utils/version.py +++ b/src/gluonnlp/utils/version.py @@ -30,6 +30,7 @@ def check_version(min_version, warning_only=False): warning_only : bool Printing a warning instead of throwing an exception. """ + # pylint: disable=import-outside-toplevel from .. 
import __version__ from packaging.version import parse bad_version = parse(__version__.replace('.dev', '')) < parse(min_version) diff --git a/src/gluonnlp/vocab/__init__.py b/src/gluonnlp/vocab/__init__.py index 49e31d0b02..e600bfe3f4 100644 --- a/src/gluonnlp/vocab/__init__.py +++ b/src/gluonnlp/vocab/__init__.py @@ -18,10 +18,10 @@ # pylint: disable=wildcard-import """Vocabulary.""" -from . import subwords, vocab, bert +from . import bert, elmo, subwords, vocab +from .bert import * +from .elmo import * from .subwords import * from .vocab import * -from .elmo import * -from .bert import * __all__ = vocab.__all__ + subwords.__all__ + elmo.__all__ + bert.__all__ diff --git a/src/gluonnlp/vocab/vocab.py b/src/gluonnlp/vocab/vocab.py index b993d9656b..46c35c4be6 100644 --- a/src/gluonnlp/vocab/vocab.py +++ b/src/gluonnlp/vocab/vocab.py @@ -597,10 +597,9 @@ def from_json(cls, json_str): if idx == token_to_idx[token]: # Valid idx continue - else: - # Introduce temporary token - token_to_idx.update({str(uuid.uuid4()): idx}) - corrected_token_to_idx[token].append(idx) + # Introduce temporary token + token_to_idx.update({str(uuid.uuid4()): idx}) + corrected_token_to_idx[token].append(idx) vocab = cls( counter=count_tokens(token_to_idx.keys()),
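
The multiprocessing change in scripts/bert/finetune_classifier.py follows from the traceback quoted in the commit message: Pool.map must pickle the BERTDatasetTransform callable to hand it to worker processes, and that pickling fails under Python 3.5. Applying the transform in-process with the built-in map avoids serialization entirely. A rough before/after sketch of the pattern, with illustrative names only (my_transform stands in for the BERTDatasetTransform instance, raw_dataset for the TSV dataset):

    import mxnet as mx

    def preprocess(my_transform, raw_dataset):
        # Previous approach: Pool.map ships my_transform to worker processes,
        # which raises _pickle.PicklingError on Python 3.5 for this transform.
        #     pool = multiprocessing.Pool()
        #     return mx.gluon.data.SimpleDataset(pool.map(my_transform, raw_dataset))
        # Approach taken by this patch: run the transform in-process with the
        # built-in map, so nothing has to be pickled.
        return mx.gluon.data.SimpleDataset(list(map(my_transform, raw_dataset)))

The trade-off is that preprocessing is now single-process, which is acceptable for the GLUE-sized datasets handled by finetune_classifier.py.
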