From c66cbbf132ef3d88dd8679f5a78b1a6396dee73d Mon Sep 17 00:00:00 2001 From: Vasily Date: Thu, 13 May 2021 16:47:46 +0300 Subject: [PATCH] Feat/convert ner (#1397) * Update config * Add automodel * Modify NER config * Add Russian NER config * Add upload script * Add torch sequence tagger * Modify sequence tagger * refactor: small refactor * fix: classes names in docs * fix: token_from_subtoken path in docs * Fix docs * feat: updated requirements Co-authored-by: Fedor Ignatov --- .../configs/ner/ner_conll2003_torch_bert.json | 21 +-- .../configs/ner/ner_rus_bert_torch.json | 155 ++++++++++++++++++ deeppavlov/core/common/registry.json | 4 +- .../core/common/requirements_registry.json | 4 +- .../torch_transformers_preprocessor.py | 5 +- ... => torch_transformers_sequence_tagger.py} | 24 +-- docs/apiref/models/torch_bert.rst | 6 +- docs/features/models/bert.rst | 2 +- tests/test_quick_start.py | 3 +- utils/prepare/hashes.py | 20 +-- utils/prepare/upload.py | 53 ++++++ 11 files changed, 252 insertions(+), 45 deletions(-) create mode 100644 deeppavlov/configs/ner/ner_rus_bert_torch.json rename deeppavlov/models/torch_bert/{torch_bert_sequence_tagger.py => torch_transformers_sequence_tagger.py} (94%) create mode 100644 utils/prepare/upload.py diff --git a/deeppavlov/configs/ner/ner_conll2003_torch_bert.json b/deeppavlov/configs/ner/ner_conll2003_torch_bert.json index f09188c6f9..c7df510000 100644 --- a/deeppavlov/configs/ner/ner_conll2003_torch_bert.json +++ b/deeppavlov/configs/ner/ner_conll2003_torch_bert.json @@ -17,8 +17,8 @@ ], "pipe": [ { - "class_name": "torch_bert_ner_preprocessor", - "vocab_file": "bert-base-cased", + "class_name": "torch_transformers_ner_preprocessor", + "vocab_file": "{TRANSFORMER}", "do_lower_case": false, "max_seq_length": 512, "max_subword_length": 15, @@ -41,8 +41,8 @@ "O" ], "pad_with_zeros": true, - "save_path": "{NER_PATH}/tag.dict", - "load_path": "{NER_PATH}/tag.dict", + "save_path": "{MODEL_PATH}/tag.dict", + "load_path": "{MODEL_PATH}/tag.dict", "fit_on": [ "y" ], @@ -54,9 +54,9 @@ ] }, { - "class_name": "torch_bert_sequence_tagger", + "class_name": "torch_transformers_sequence_tagger", "n_tags": "#tag_vocab.len", - "pretrained_bert": "bert-base-cased", + "pretrained_bert": "{TRANSFORMER}", "attention_probs_keep_prob": 0.5, "return_probas": false, "encoder_layer_ids": [ @@ -77,8 +77,8 @@ "learning_rate_drop_patience": 30, "learning_rate_drop_div": 1.5, "load_before_drop": true, - "save_path": "{NER_PATH}/model", - "load_path": "{NER_PATH}/model", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", "in": [ "x_subword_tok_ids", "attention_mask", @@ -142,11 +142,12 @@ "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", - "NER_PATH": "{MODELS_PATH}/ner_conll2003_torch_bert" + "TRANSFORMER": "bert-base-uncased", + "MODEL_PATH": "{MODELS_PATH}/ner_conll2003_torch_bert" }, "download": [ { - "url": "http://files.deeppavlov.ai/deeppavlov_data/ner_conll2003_torch_bert_v0.tar.gz", + "url": "http://files.deeppavlov.ai/v1/ner/ner_conll2003_torch_bert.tar.gz", "subdir": "{MODELS_PATH}" } ] diff --git a/deeppavlov/configs/ner/ner_rus_bert_torch.json b/deeppavlov/configs/ner/ner_rus_bert_torch.json new file mode 100644 index 0000000000..0c8e5c568b --- /dev/null +++ b/deeppavlov/configs/ner/ner_rus_bert_torch.json @@ -0,0 +1,155 @@ +{ + "dataset_reader": { + "class_name": "conll2003_reader", + "data_path": "{DOWNLOADS_PATH}/total_rus/", + "dataset_name": "collection_rus", + "provide_pos": false + 
}, + "dataset_iterator": { + "class_name": "data_learning_iterator" + }, + "chainer": { + "in": [ + "x" + ], + "in_y": [ + "y" + ], + "pipe": [ + { + "class_name": "torch_transformers_ner_preprocessor", + "vocab_file": "{TRANSFORMER}", + "do_lower_case": false, + "max_seq_length": 512, + "max_subword_length": 15, + "token_masking_prob": 0.0, + "in": [ + "x" + ], + "out": [ + "x_tokens", + "x_subword_tokens", + "x_subword_tok_ids", + "startofword_markers", + "attention_mask" + ] + }, + { + "id": "tag_vocab", + "class_name": "simple_vocab", + "unk_token": [ + "O" + ], + "pad_with_zeros": true, + "save_path": "{MODEL_PATH}/tag.dict", + "load_path": "{MODEL_PATH}/tag.dict", + "fit_on": [ + "y" + ], + "in": [ + "y" + ], + "out": [ + "y_ind" + ] + }, + { + "class_name": "torch_transformers_sequence_tagger", + "n_tags": "#tag_vocab.len", + "pretrained_bert": "{TRANSFORMER}", + "attention_probs_keep_prob": 0.5, + "return_probas": false, + "encoder_layer_ids": [ + -1 + ], + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 2e-5, + "weight_decay": 1e-6, + "betas": [ + 0.9, + 0.999 + ], + "eps": 1e-6 + }, + "clip_norm": 1.0, + "min_learning_rate": 1e-7, + "learning_rate_drop_patience": 30, + "learning_rate_drop_div": 1.5, + "load_before_drop": true, + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "in": [ + "x_subword_tok_ids", + "attention_mask", + "startofword_markers" + ], + "in_y": [ + "y_ind" + ], + "out": [ + "y_pred_ind" + ] + }, + { + "ref": "tag_vocab", + "in": [ + "y_pred_ind" + ], + "out": [ + "y_pred" + ] + } + ], + "out": [ + "x_tokens", + "y_pred" + ] + }, + "train": { + "epochs": 30, + "batch_size": 10, + "metrics": [ + { + "name": "ner_f1", + "inputs": [ + "y", + "y_pred" + ] + }, + { + "name": "ner_token_f1", + "inputs": [ + "y", + "y_pred" + ] + } + ], + "validation_patience": 100, + "val_every_n_batches": 20, + "log_every_n_batches": 20, + "show_examples": false, + "pytest_max_batches": 2, + "pytest_batch_size": 8, + "evaluation_targets": [ + "valid", + "test" + ], + "class_name": "torch_trainer" + }, + "metadata": { + "variables": { + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "TRANSFORMER": "DeepPavlov/rubert-base-cased", + "MODEL_PATH": "{MODELS_PATH}/ner_rus_bert_torch" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/v1/ner/ner_rus_bert_torch.tar.gz", + "subdir": "{MODELS_PATH}" + } + ] + } +} diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json index 650b7bc05f..30f493b085 100644 --- a/deeppavlov/core/common/registry.json +++ b/deeppavlov/core/common/registry.json @@ -178,10 +178,10 @@ "tfidf_ranker": "deeppavlov.models.doc_retrieval.tfidf_ranker:TfidfRanker", "tfidf_weighted": "deeppavlov.models.embedders.tfidf_weighted_embedder:TfidfWeightedEmbedder", "top1_elector": "deeppavlov.models.spelling_correction.electors.top1_elector:TopOneElector", - "torch_bert_ner_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchBertNerPreprocessor", + "torch_transformers_ner_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchTransformersNerPreprocessor", "torch_bert_ranker": "deeppavlov.models.torch_bert.torch_bert_ranker:TorchBertRankerModel", "torch_bert_ranker_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchBertRankerPreprocessor", - "torch_bert_sequence_tagger": 
"deeppavlov.models.torch_bert.torch_bert_sequence_tagger:TorchBertSequenceTagger", + "torch_transformers_sequence_tagger": "deeppavlov.models.torch_bert.torch_transformers_sequence_tagger:TorchTransformersSequenceTagger", "torch_squad_bert_infer": "deeppavlov.models.torch_bert.torch_bert_squad:TorchBertSQuADInferModel", "torch_squad_bert_model": "deeppavlov.models.torch_bert.torch_bert_squad:TorchBertSQuADModel", "torch_text_classification_model": "deeppavlov.models.classifiers.torch_classification_model:TorchTextClassificationModel", diff --git a/deeppavlov/core/common/requirements_registry.json b/deeppavlov/core/common/requirements_registry.json index f6f4160fe4..9afb597ab0 100644 --- a/deeppavlov/core/common/requirements_registry.json +++ b/deeppavlov/core/common/requirements_registry.json @@ -236,7 +236,7 @@ "{DEEPPAVLOV_PATH}/requirements/pytorch16.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], - "torch_bert_sequence_tagger": [ + "torch_transformers_sequence_tagger": [ "{DEEPPAVLOV_PATH}/requirements/pytorch16.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], @@ -247,7 +247,7 @@ "{DEEPPAVLOV_PATH}/requirements/pytorch16.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], - "torch_bert_ner_preprocessor": [ + "torch_transformers_ner_preprocessor": [ "{DEEPPAVLOV_PATH}/requirements/pytorch16.txt", "{DEEPPAVLOV_PATH}/requirements/transformers.txt" ], diff --git a/deeppavlov/models/preprocessors/torch_transformers_preprocessor.py b/deeppavlov/models/preprocessors/torch_transformers_preprocessor.py index 2623842031..ca28bf2dec 100644 --- a/deeppavlov/models/preprocessors/torch_transformers_preprocessor.py +++ b/deeppavlov/models/preprocessors/torch_transformers_preprocessor.py @@ -107,8 +107,8 @@ def __call__(self, texts_a: List[str], texts_b: Optional[List[str]] = None) -> U return input_features -@register('torch_bert_ner_preprocessor') -class TorchBertNerPreprocessor(Component): +@register('torch_transformers_ner_preprocessor') +class TorchTransformersNerPreprocessor(Component): """Takes tokens and splits them into bert subtokens, encodes subtokens with their indices. Creates a mask of subtokens (one for the first subtoken, zero for the others). 
@@ -186,6 +186,7 @@ def __call__(self, f"length of sow_marker({len(sw_marker)}), tokens({len(sw_toks)})," \ f" token ids({len(subword_tok_ids[-1])}) and ys({len(ys)})" \ f" for tokens = `{toks}` should match" + subword_tok_ids = zero_pad(subword_tok_ids, dtype=int, padding=0) startofword_markers = zero_pad(startofword_markers, dtype=int, padding=0) attention_mask = Mask()(subword_tokens) diff --git a/deeppavlov/models/torch_bert/torch_bert_sequence_tagger.py b/deeppavlov/models/torch_bert/torch_transformers_sequence_tagger.py similarity index 94% rename from deeppavlov/models/torch_bert/torch_bert_sequence_tagger.py rename to deeppavlov/models/torch_bert/torch_transformers_sequence_tagger.py index 5a6859b042..7d9559daa5 100644 --- a/deeppavlov/models/torch_bert/torch_bert_sequence_tagger.py +++ b/deeppavlov/models/torch_bert/torch_transformers_sequence_tagger.py @@ -19,10 +19,10 @@ import numpy as np import torch from overrides import overrides -from transformers import BertForTokenClassification, BertConfig +from transformers import AutoModelForTokenClassification, AutoConfig -from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.commands.utils import expand_path +from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.registry import register from deeppavlov.core.models.torch_model import TorchModel @@ -192,8 +192,8 @@ def token_labels_to_subtoken_labels(labels, y_mask, input_mask): return subtoken_labels -@register('torch_bert_sequence_tagger') -class TorchBertSequenceTagger(TorchModel): +@register('torch_transformers_sequence_tagger') +class TorchTransformersSequenceTagger(TorchModel): """BERT-based model on PyTorch for text tagging. It predicts a label for every token (not subtoken) in the text. You can use it for sequence labeling tasks, such as morphological tagging or named entity recognition. @@ -276,7 +276,7 @@ def train_on_batch(self, b_labels = torch.from_numpy(np.array(subtoken_labels)).to(torch.int64).to(self.device) self.optimizer.zero_grad() - loss, logits = self.model(input_ids=b_input_ids, token_type_ids=None, attention_mask=b_input_masks, + loss, logits = self.model(input_ids=b_input_ids, attention_mask=b_input_masks, labels=b_labels) loss.backward() # Clip the norm of the gradients to 1.0. 
@@ -310,7 +310,7 @@ def __call__(self,
 
         with torch.no_grad():
             # Forward pass, calculate logit predictions
-            logits = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_masks)
+            logits = self.model(b_input_ids, attention_mask=b_input_masks)
 
         # Move logits and labels to CPU and to numpy arrays
         logits = token_from_subtoken(logits[0].detach().cpu(), torch.from_numpy(y_masks))
@@ -331,18 +331,18 @@ def load(self, fname=None):
         if fname is not None:
             self.load_path = fname
 
-        if self.pretrained_bert and not Path(self.pretrained_bert).is_file():
-            self.model = BertForTokenClassification.from_pretrained(
-                self.pretrained_bert, num_labels=self.n_classes,
-                output_attentions=False, output_hidden_states=False)
+        if self.pretrained_bert:
+            config = AutoConfig.from_pretrained(self.pretrained_bert, num_labels=self.n_classes,
+                                                output_attentions=False, output_hidden_states=False)
+            self.model = AutoModelForTokenClassification.from_pretrained(self.pretrained_bert, config=config)
 
         elif self.bert_config_file and Path(self.bert_config_file).is_file():
-            self.bert_config = BertConfig.from_json_file(str(expand_path(self.bert_config_file)))
+            self.bert_config = AutoConfig.from_pretrained(str(expand_path(self.bert_config_file)))
             if self.attention_probs_keep_prob is not None:
                 self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob
             if self.hidden_keep_prob is not None:
                 self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob
-            self.model = BertForTokenClassification(config=self.bert_config)
+            self.model = AutoModelForTokenClassification.from_config(self.bert_config)
 
         else:
             raise ConfigError("No pre-trained BERT model is given.")
diff --git a/docs/apiref/models/torch_bert.rst b/docs/apiref/models/torch_bert.rst
index caa2e02355..403d96b354 100644
--- a/docs/apiref/models/torch_bert.rst
+++ b/docs/apiref/models/torch_bert.rst
@@ -8,7 +8,7 @@ deeppavlov.models.torch_bert
 
    .. automethod:: __call__
 
-.. autoclass:: deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchBertNerPreprocessor
+.. autoclass:: deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchTransformersNerPreprocessor
 
    .. automethod:: __call__
 
@@ -21,9 +21,7 @@ deeppavlov.models.torch_bert
 
    .. automethod:: __call__
    .. automethod:: train_on_batch
 
-.. autofunction:: deeppavlov.models.torch_bert.torch_bert_sequence_tagger.token_from_subtoken
-
-.. autoclass:: deeppavlov.models.torch_bert.torch_bert_sequence_tagger.TorchBertSequenceTagger
+.. autoclass:: deeppavlov.models.torch_bert.torch_transformers_sequence_tagger.TorchTransformersSequenceTagger
 
    .. automethod:: __call__
    .. automethod:: train_on_batch
diff --git a/docs/features/models/bert.rst b/docs/features/models/bert.rst
index 37b19e0327..260c4be089 100644
--- a/docs/features/models/bert.rst
+++ b/docs/features/models/bert.rst
@@ -117,7 +117,7 @@ BERT for Named Entity Recognition (Sequence Tagging)
 Pre-trained BERT model can be used for sequence tagging. Examples of BERT application to sequence tagging
 can be found :doc:`here `. The modules used for tagging are
 :class:`~deeppavlov.models.bert.bert_sequence_tagger.BertSequenceTagger` on TensorFlow and
-:class:`~deeppavlov.models.torch_bert.torch_bert_sequence_tagger.TorchBertSequenceTagger` on PyTorch.
+:class:`~deeppavlov.models.torch_bert.torch_transformers_sequence_tagger.TorchTransformersSequenceTagger` on PyTorch.
 The tags are obtained by applying a dense layer to the representation of the first subtoken of each word.
There is also an optional CRF layer on the top for TensorFlow implementation. diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py index 16d50cd39c..420b46bd6b 100644 --- a/tests/test_quick_start.py +++ b/tests/test_quick_start.py @@ -188,7 +188,8 @@ ], ("ner/slotfill_simple_rasa_raw.json", "slotfill_simple_rasa_raw", ('IP')): [ ("i see 1 cat", ({"number": '1'},))], - ("ner/ner_conll2003_torch_bert.json", "ner_conll2003_torch_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK] + ("ner/ner_conll2003_torch_bert.json", "ner_conll2003_torch_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK], + ("ner/ner_rus_bert_torch.json", "ner_rus_bert_torch", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK] }, "sentence_segmentation": { ("sentence_segmentation/sentseg_dailydialog.json", "sentseg_dailydialog", ('IP', 'TI')): [ diff --git a/utils/prepare/hashes.py b/utils/prepare/hashes.py index e021beb6a3..d122c637d7 100644 --- a/utils/prepare/hashes.py +++ b/utils/prepare/hashes.py @@ -18,7 +18,7 @@ import tarfile from hashlib import md5 from pathlib import Path -from typing import List, Dict, Union +from typing import Dict, Optional, Union from zipfile import ZipFile from deeppavlov.core.data.utils import file_md5 @@ -79,17 +79,10 @@ def compute_hashes(fpath: Union[str, Path]) -> Dict[str, str]: return hashes -def main(args: List[str] = None) -> None: - parser = argparse.ArgumentParser() - parser.add_argument("fname", help="path to a file to compute hash for", type=str) - parser.add_argument('-o', '--outfile', help='where to write the hashes', default=None, type=str) - - args = parser.parse_args(args) - - p = Path(args.fname).expanduser() +def main(fname: str, outfile: Optional[str] = None) -> None: + p = Path(fname).expanduser() hashes = compute_hashes(p) - outfile = args.outfile if outfile is None: outfile = p.with_suffix(p.suffix + '.md5').open('w', encoding='utf-8') elif outfile == '-': @@ -105,4 +98,9 @@ def main(args: List[str] = None) -> None: if __name__ == '__main__': - main() + parser = argparse.ArgumentParser() + parser.add_argument("fname", help="path to a file to compute hash for", type=str) + parser.add_argument('-o', '--outfile', help='where to write the hashes', default=None, type=str) + + args = parser.parse_args() + main(args.fname, args.outfile) diff --git a/utils/prepare/upload.py b/utils/prepare/upload.py new file mode 100644 index 0000000000..d488f5bc3b --- /dev/null +++ b/utils/prepare/upload.py @@ -0,0 +1,53 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import argparse
+import os
+import shutil
+import tarfile
+from pathlib import Path
+
+from deeppavlov.core.commands.utils import parse_config
+from deeppavlov.core.common.file import find_config
+from hashes import main
+
+
+def upload(config_in_file):
+    config_in = parse_config(config_in_file)
+    config_in_file = find_config(config_in_file)
+
+    model_path = Path(config_in['metadata']['variables']['MODEL_PATH']).expanduser()
+
+    model_name, class_name = config_in_file.stem, config_in_file.parent.name
+
+    tmp_dir = f'/tmp/{class_name}'
+    tmp_tar = f'/tmp/{class_name}/{model_name}.tar.gz'
+    shutil.rmtree(tmp_dir, ignore_errors=True)
+    os.mkdir(tmp_dir)
+
+    with tarfile.open(tmp_tar, "w:gz") as tar:
+        tar.add(model_path, arcname=model_name)
+
+    main(tmp_tar)
+
+    command = f'scp -r {tmp_dir} share.ipavlov.mipt.ru:/home/export/v1/'
+    download_url = f'http://files.deeppavlov.ai/v1/{class_name}/{model_name}.tar.gz'
+    print(command, download_url, sep='\n')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("config_in", help="path to a config", type=str)
+    args = parser.parse_args()
+    upload(args.config_in)
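
For a quick end-to-end check of the new ner_rus_bert_torch config, a minimal inference sketch, assuming this branch is installed and the archive from the config's download section is reachable (the input sentence is only an example):

    from deeppavlov import build_model, configs

    # First run fetches the files declared in the config's `download` section,
    # then assembles the chainer pipeline: preprocessor -> tagger -> tag vocab.
    ner = build_model(configs.ner.ner_rus_bert_torch, download=True)

    # The chainer's `out` is ["x_tokens", "y_pred"]: the tokenized input and
    # one BIO tag per token (not per subtoken).
    tokens, tags = ner(['Москва - столица России.'])
    print(list(zip(tokens[0], tags[0])))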
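
The substance of the conversion is the switch from BertForTokenClassification/BertConfig to the Auto* factories, which is what lets pretrained_bert (the {TRANSFORMER} variable) name any Hugging Face checkpoint, not only BERT ones. A standalone sketch of the loading pattern used in load(); the checkpoint name and label count here are illustrative:

    from transformers import AutoConfig, AutoModelForTokenClassification

    # AutoConfig reads the checkpoint's config, and the Auto* model class
    # dispatches to the matching architecture (BertForTokenClassification
    # in this case) based on the config's model_type.
    config = AutoConfig.from_pretrained('DeepPavlov/rubert-base-cased', num_labels=7,
                                        output_attentions=False, output_hidden_states=False)
    model = AutoModelForTokenClassification.from_pretrained('DeepPavlov/rubert-base-cased',
                                                            config=config)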
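
The hashes.py refactor turns main() into a plain function so that upload.py can import and call it instead of shelling out. A sketch of the intended release flow, assuming the script is run from utils/prepare (the config name is an example; DeepPavlov resolves shortcut names via find_config):

    # Equivalent CLI: python upload.py ner_rus_bert_torch
    from upload import upload

    # Tars {MODEL_PATH} into /tmp/ner/ner_rus_bert_torch.tar.gz, writes the
    # matching .tar.gz.md5 next to it via hashes.main, then prints the scp
    # command for share.ipavlov.mipt.ru and the files.deeppavlov.ai URL.
    upload('ner_rus_bert_torch')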