Skip to content

Commit

Permalink
Feat/convert ner (#1397)
Browse files Browse the repository at this point in the history
* Update config

* Add automodel

* Modify NER config

* Add Russian NER config

* Add upload script

* Add torch sequence tagger

* Modify sequence tagger

* refactor: small refactor

* fix: classes names in docs

* fix: token_from_subtoken path in docs

* Fix docs

* feat: updated requirements

Co-authored-by: Fedor Ignatov <[email protected]>
  • Loading branch information
vaskonov and IgnatovFedor authored May 13, 2021
1 parent 5fde446 commit c66cbbf
Show file tree
Hide file tree
Showing 11 changed files with 252 additions and 45 deletions.
21 changes: 11 additions & 10 deletions deeppavlov/configs/ner/ner_conll2003_torch_bert.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
],
"pipe": [
{
"class_name": "torch_bert_ner_preprocessor",
"vocab_file": "bert-base-cased",
"class_name": "torch_transformers_ner_preprocessor",
"vocab_file": "{TRANSFORMER}",
"do_lower_case": false,
"max_seq_length": 512,
"max_subword_length": 15,
Expand All @@ -41,8 +41,8 @@
"O"
],
"pad_with_zeros": true,
"save_path": "{NER_PATH}/tag.dict",
"load_path": "{NER_PATH}/tag.dict",
"save_path": "{MODEL_PATH}/tag.dict",
"load_path": "{MODEL_PATH}/tag.dict",
"fit_on": [
"y"
],
Expand All @@ -54,9 +54,9 @@
]
},
{
"class_name": "torch_bert_sequence_tagger",
"class_name": "torch_transformers_sequence_tagger",
"n_tags": "#tag_vocab.len",
"pretrained_bert": "bert-base-cased",
"pretrained_bert": "{TRANSFORMER}",
"attention_probs_keep_prob": 0.5,
"return_probas": false,
"encoder_layer_ids": [
Expand All @@ -77,8 +77,8 @@
"learning_rate_drop_patience": 30,
"learning_rate_drop_div": 1.5,
"load_before_drop": true,
"save_path": "{NER_PATH}/model",
"load_path": "{NER_PATH}/model",
"save_path": "{MODEL_PATH}/model",
"load_path": "{MODEL_PATH}/model",
"in": [
"x_subword_tok_ids",
"attention_mask",
Expand Down Expand Up @@ -142,11 +142,12 @@
"ROOT_PATH": "~/.deeppavlov",
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
"MODELS_PATH": "{ROOT_PATH}/models",
"NER_PATH": "{MODELS_PATH}/ner_conll2003_torch_bert"
"TRANSFORMER": "bert-base-cased",
"MODEL_PATH": "{MODELS_PATH}/ner_conll2003_torch_bert"
},
"download": [
{
"url": "http://files.deeppavlov.ai/deeppavlov_data/ner_conll2003_torch_bert_v0.tar.gz",
"url": "http://files.deeppavlov.ai/v1/ner/ner_conll2003_torch_bert.tar.gz",
"subdir": "{MODELS_PATH}"
}
]
Expand Down
155 changes: 155 additions & 0 deletions deeppavlov/configs/ner/ner_rus_bert_torch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
{
"dataset_reader": {
"class_name": "conll2003_reader",
"data_path": "{DOWNLOADS_PATH}/total_rus/",
"dataset_name": "collection_rus",
"provide_pos": false
},
"dataset_iterator": {
"class_name": "data_learning_iterator"
},
"chainer": {
"in": [
"x"
],
"in_y": [
"y"
],
"pipe": [
{
"class_name": "torch_transformers_ner_preprocessor",
"vocab_file": "{TRANSFORMER}",
"do_lower_case": false,
"max_seq_length": 512,
"max_subword_length": 15,
"token_masking_prob": 0.0,
"in": [
"x"
],
"out": [
"x_tokens",
"x_subword_tokens",
"x_subword_tok_ids",
"startofword_markers",
"attention_mask"
]
},
{
"id": "tag_vocab",
"class_name": "simple_vocab",
"unk_token": [
"O"
],
"pad_with_zeros": true,
"save_path": "{MODEL_PATH}/tag.dict",
"load_path": "{MODEL_PATH}/tag.dict",
"fit_on": [
"y"
],
"in": [
"y"
],
"out": [
"y_ind"
]
},
{
"class_name": "torch_transformers_sequence_tagger",
"n_tags": "#tag_vocab.len",
"pretrained_bert": "{TRANSFORMER}",
"attention_probs_keep_prob": 0.5,
"return_probas": false,
"encoder_layer_ids": [
-1
],
"optimizer": "AdamW",
"optimizer_parameters": {
"lr": 2e-5,
"weight_decay": 1e-6,
"betas": [
0.9,
0.999
],
"eps": 1e-6
},
"clip_norm": 1.0,
"min_learning_rate": 1e-7,
"learning_rate_drop_patience": 30,
"learning_rate_drop_div": 1.5,
"load_before_drop": true,
"save_path": "{MODEL_PATH}/model",
"load_path": "{MODEL_PATH}/model",
"in": [
"x_subword_tok_ids",
"attention_mask",
"startofword_markers"
],
"in_y": [
"y_ind"
],
"out": [
"y_pred_ind"
]
},
{
"ref": "tag_vocab",
"in": [
"y_pred_ind"
],
"out": [
"y_pred"
]
}
],
"out": [
"x_tokens",
"y_pred"
]
},
"train": {
"epochs": 30,
"batch_size": 10,
"metrics": [
{
"name": "ner_f1",
"inputs": [
"y",
"y_pred"
]
},
{
"name": "ner_token_f1",
"inputs": [
"y",
"y_pred"
]
}
],
"validation_patience": 100,
"val_every_n_batches": 20,
"log_every_n_batches": 20,
"show_examples": false,
"pytest_max_batches": 2,
"pytest_batch_size": 8,
"evaluation_targets": [
"valid",
"test"
],
"class_name": "torch_trainer"
},
"metadata": {
"variables": {
"ROOT_PATH": "~/.deeppavlov",
"DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
"MODELS_PATH": "{ROOT_PATH}/models",
"TRANSFORMER": "DeepPavlov/rubert-base-cased",
"MODEL_PATH": "{MODELS_PATH}/ner_rus_bert_torch"
},
"download": [
{
"url": "http://files.deeppavlov.ai/v1/ner/ner_rus_bert_torch.tar.gz",
"subdir": "{MODELS_PATH}"
}
]
}
}
4 changes: 2 additions & 2 deletions deeppavlov/core/common/registry.json
Original file line number Diff line number Diff line change
Expand Up @@ -178,10 +178,10 @@
"tfidf_ranker": "deeppavlov.models.doc_retrieval.tfidf_ranker:TfidfRanker",
"tfidf_weighted": "deeppavlov.models.embedders.tfidf_weighted_embedder:TfidfWeightedEmbedder",
"top1_elector": "deeppavlov.models.spelling_correction.electors.top1_elector:TopOneElector",
"torch_bert_ner_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchBertNerPreprocessor",
"torch_transformers_ner_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchTransformersNerPreprocessor",
"torch_bert_ranker": "deeppavlov.models.torch_bert.torch_bert_ranker:TorchBertRankerModel",
"torch_bert_ranker_preprocessor": "deeppavlov.models.preprocessors.torch_transformers_preprocessor:TorchBertRankerPreprocessor",
"torch_bert_sequence_tagger": "deeppavlov.models.torch_bert.torch_bert_sequence_tagger:TorchBertSequenceTagger",
"torch_transformers_sequence_tagger": "deeppavlov.models.torch_bert.torch_transformers_sequence_tagger:TorchTransformersSequenceTagger",
"torch_squad_bert_infer": "deeppavlov.models.torch_bert.torch_bert_squad:TorchBertSQuADInferModel",
"torch_squad_bert_model": "deeppavlov.models.torch_bert.torch_bert_squad:TorchBertSQuADModel",
"torch_text_classification_model": "deeppavlov.models.classifiers.torch_classification_model:TorchTextClassificationModel",
Expand Down
4 changes: 2 additions & 2 deletions deeppavlov/core/common/requirements_registry.json
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@
"{DEEPPAVLOV_PATH}/requirements/pytorch16.txt",
"{DEEPPAVLOV_PATH}/requirements/transformers.txt"
],
"torch_bert_sequence_tagger": [
"torch_transformers_sequence_tagger": [
"{DEEPPAVLOV_PATH}/requirements/pytorch16.txt",
"{DEEPPAVLOV_PATH}/requirements/transformers.txt"
],
Expand All @@ -247,7 +247,7 @@
"{DEEPPAVLOV_PATH}/requirements/pytorch16.txt",
"{DEEPPAVLOV_PATH}/requirements/transformers.txt"
],
"torch_bert_ner_preprocessor": [
"torch_transformers_ner_preprocessor": [
"{DEEPPAVLOV_PATH}/requirements/pytorch16.txt",
"{DEEPPAVLOV_PATH}/requirements/transformers.txt"
],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,8 @@ def __call__(self, texts_a: List[str], texts_b: Optional[List[str]] = None) -> U
return input_features


@register('torch_bert_ner_preprocessor')
class TorchBertNerPreprocessor(Component):
@register('torch_transformers_ner_preprocessor')
class TorchTransformersNerPreprocessor(Component):
"""Takes tokens and splits them into bert subtokens, encodes subtokens with their indices.
Creates a mask of subtokens (one for the first subtoken, zero for the others).
Expand Down Expand Up @@ -186,6 +186,7 @@ def __call__(self,
f"length of sow_marker({len(sw_marker)}), tokens({len(sw_toks)})," \
f" token ids({len(subword_tok_ids[-1])}) and ys({len(ys)})" \
f" for tokens = `{toks}` should match"

subword_tok_ids = zero_pad(subword_tok_ids, dtype=int, padding=0)
startofword_markers = zero_pad(startofword_markers, dtype=int, padding=0)
attention_mask = Mask()(subword_tokens)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@
import numpy as np
import torch
from overrides import overrides
from transformers import BertForTokenClassification, BertConfig
from transformers import AutoModelForTokenClassification, AutoConfig

from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.torch_model import TorchModel

Expand Down Expand Up @@ -192,8 +192,8 @@ def token_labels_to_subtoken_labels(labels, y_mask, input_mask):
return subtoken_labels


@register('torch_bert_sequence_tagger')
class TorchBertSequenceTagger(TorchModel):
@register('torch_transformers_sequence_tagger')
class TorchTransformersSequenceTagger(TorchModel):
"""BERT-based model on PyTorch for text tagging. It predicts a label for every token (not subtoken) in the text.
You can use it for sequence labeling tasks, such as morphological tagging or named entity recognition.
Expand Down Expand Up @@ -276,7 +276,7 @@ def train_on_batch(self,
b_labels = torch.from_numpy(np.array(subtoken_labels)).to(torch.int64).to(self.device)
self.optimizer.zero_grad()

loss, logits = self.model(input_ids=b_input_ids, token_type_ids=None, attention_mask=b_input_masks,
loss, logits = self.model(input_ids=b_input_ids, attention_mask=b_input_masks,
labels=b_labels)
loss.backward()
# Clip the norm of the gradients to 1.0.
Expand Down Expand Up @@ -310,7 +310,7 @@ def __call__(self,

with torch.no_grad():
# Forward pass, calculate logit predictions
logits = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_masks)
logits = self.model(b_input_ids, attention_mask=b_input_masks)

# Move logits and labels to CPU and to numpy arrays
logits = token_from_subtoken(logits[0].detach().cpu(), torch.from_numpy(y_masks))
Expand All @@ -331,18 +331,18 @@ def load(self, fname=None):
if fname is not None:
self.load_path = fname

if self.pretrained_bert and not Path(self.pretrained_bert).is_file():
self.model = BertForTokenClassification.from_pretrained(
self.pretrained_bert, num_labels=self.n_classes,
output_attentions=False, output_hidden_states=False)
if self.pretrained_bert:
config = AutoConfig.from_pretrained(self.pretrained_bert, num_labels=self.n_classes,
output_attentions=False, output_hidden_states=False)
self.model = AutoModelForTokenClassification.from_pretrained(self.pretrained_bert, config=config)
elif self.bert_config_file and Path(self.bert_config_file).is_file():
self.bert_config = BertConfig.from_json_file(str(expand_path(self.bert_config_file)))
self.bert_config = AutoConfig.from_json_file(str(expand_path(self.bert_config_file)))

if self.attention_probs_keep_prob is not None:
self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob
if self.hidden_keep_prob is not None:
self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob
self.model = BertForTokenClassification(config=self.bert_config)
self.model = AutoModelForTokenClassification(config=self.bert_config)
else:
raise ConfigError("No pre-trained BERT model is given.")

Expand Down
6 changes: 2 additions & 4 deletions docs/apiref/models/torch_bert.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ deeppavlov.models.torch_bert

.. automethod:: __call__

.. autoclass:: deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchBertNerPreprocessor
.. autoclass:: deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchTransformersNerPreprocessor

.. automethod:: __call__

Expand All @@ -21,9 +21,7 @@ deeppavlov.models.torch_bert
.. automethod:: __call__
.. automethod:: train_on_batch

.. autofunction:: deeppavlov.models.torch_bert.torch_bert_sequence_tagger.token_from_subtoken

.. autoclass:: deeppavlov.models.torch_bert.torch_bert_sequence_tagger.TorchBertSequenceTagger
.. autoclass:: deeppavlov.models.torch_bert.torch_transformers_sequence_tagger.TorchTransformersSequenceTagger

.. automethod:: __call__
.. automethod:: train_on_batch
Expand Down
2 changes: 1 addition & 1 deletion docs/features/models/bert.rst
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ BERT for Named Entity Recognition (Sequence Tagging)
Pre-trained BERT model can be used for sequence tagging. Examples of BERT application to sequence tagging
can be found :doc:`here </features/models/ner>`. The modules used for tagging
are :class:`~deeppavlov.models.bert.bert_sequence_tagger.BertSequenceTagger` on TensorFlow and
:class:`~deeppavlov.models.torch_bert.torch_bert_sequence_tagger.TorchBertSequenceTagger` on PyTorch.
:class:`~deeppavlov.models.torch_bert.torch_transformers_sequence_tagger.TorchTransformersSequenceTagger` on PyTorch.
The tags are obtained by applying a dense layer to the representation of
the first subtoken of each word. There is also an optional CRF layer on the top for TensorFlow implementation.

Expand Down
3 changes: 2 additions & 1 deletion tests/test_quick_start.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,8 @@
],
("ner/slotfill_simple_rasa_raw.json", "slotfill_simple_rasa_raw", ('IP')): [
("i see 1 cat", ({"number": '1'},))],
("ner/ner_conll2003_torch_bert.json", "ner_conll2003_torch_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK]
("ner/ner_conll2003_torch_bert.json", "ner_conll2003_torch_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK],
("ner/ner_rus_bert_torch.json", "ner_rus_bert_torch", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK]
},
"sentence_segmentation": {
("sentence_segmentation/sentseg_dailydialog.json", "sentseg_dailydialog", ('IP', 'TI')): [
Expand Down
Loading

0 comments on commit c66cbbf

Please sign in to comment.