Commit e376fe3 (1 parent: 91003b5)
Showing 76 changed files with 5,358 additions and 0 deletions.
@@ -0,0 +1,125 @@
import argparse
import torch
import uer.trainer as trainer
from uer.utils.config import load_hyperparam
from uer.opts import *


def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--dataset_path", type=str, default="dataset.pt",
                        help="Path of the preprocessed dataset.")
    parser.add_argument("--vocab_path", default=None, type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--tgt_vocab_path", default=None, type=str,
                        help="Path of the target vocabulary file.")
    parser.add_argument("--tgt_spm_model_path", default=None, type=str,
                        help="Path of the target sentence piece model.")
    parser.add_argument("--pretrained_model_path", type=str, default=None,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path", type=str, required=True,
                        help="Path of the output model.")
    parser.add_argument("--config_path", type=str, default="models/bert/base_config.json",
                        help="Config file of model hyper-parameters.")

    # Training and saving options.
    parser.add_argument("--total_steps", type=int, default=100000,
                        help="Total training steps.")
    parser.add_argument("--save_checkpoint_steps", type=int, default=10000,
                        help="Specific steps to save model checkpoint.")
    parser.add_argument("--report_steps", type=int, default=100,
                        help="Specific steps to print prompt.")
    parser.add_argument("--accumulation_steps", type=int, default=1,
                        help="Specific steps to accumulate gradient.")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Training batch size. The actual batch_size is [batch_size x world_size x accumulation_steps].")
    parser.add_argument("--instances_buffer_size", type=int, default=25600,
                        help="The buffer size of instances in memory.")
    parser.add_argument("--labels_num", type=int, required=False,
                        help="Number of prediction labels.")
    parser.add_argument("--dropout", type=float, default=0.1, help="Dropout value.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")

    # Preprocess options.
    parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
                        help="Specify the tokenizer. "
                             "Original Google BERT uses bert tokenizer on Chinese corpus. "
                             "Char tokenizer segments sentences into characters. "
                             "Space tokenizer segments sentences into words according to space."
                        )

    # Model options.
    model_opts(parser)
    parser.add_argument("--tgt_embedding", choices=["word", "word_pos", "word_pos_seg", "word_sinusoidalpos"], default="word_pos_seg",
                        help="Target embedding type.")
    parser.add_argument("--decoder", choices=["transformer"], default="transformer", help="Decoder type.")
    parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first",
                        help="Pooling type.")
    parser.add_argument("--target", choices=["bert", "lm", "mlm", "bilm", "albert", "seq2seq", "t5", "cls", "prefixlm"], default="bert",
                        help="The training target of the pretraining model.")
    parser.add_argument("--tie_weights", action="store_true",
                        help="Tie the word embedding and softmax weights.")
    parser.add_argument("--has_lmtarget_bias", action="store_true",
                        help="Add bias on output_layer for lm target.")

    # Masking options.
    parser.add_argument("--whole_word_masking", action="store_true", help="Whole word masking.")
    parser.add_argument("--span_masking", action="store_true", help="Span masking.")
    parser.add_argument("--span_geo_prob", type=float, default=0.2,
                        help="Hyperparameter of geometric distribution for span masking.")
    parser.add_argument("--span_max_length", type=int, default=10,
                        help="Max length for span masking.")

    # Optimizer options.
    optimization_opts(parser)

    # GPU options.
    parser.add_argument("--world_size", type=int, default=1, help="Total number of processes (GPUs) for training.")
    parser.add_argument("--gpu_ranks", default=[], nargs='+', type=int, help="List of ranks of each process."
                        " Each process has a unique integer rank whose value is in the interval [0, world_size), and runs in a single GPU.")
    parser.add_argument("--master_ip", default="tcp://localhost:12345", type=str, help="IP-Port of master for training.")
    parser.add_argument("--backend", choices=["nccl", "gloo"], default="nccl", type=str, help="Distributed backend.")

    args = parser.parse_args()

    if args.target == "cls":
        assert args.labels_num is not None, "Cls target needs the denotation of the number of labels."

    # Load hyper-parameters from config file.
    if args.config_path:
        load_hyperparam(args)

    ranks_num = len(args.gpu_ranks)

    if args.world_size > 1:
        # Multiprocessing distributed mode.
        assert torch.cuda.is_available(), "No available GPUs."
        assert ranks_num <= args.world_size, "Started processes exceed `world_size` upper limit."
        assert ranks_num <= torch.cuda.device_count(), "Started processes exceed the available GPUs."
        args.dist_train = True
        args.ranks_num = ranks_num
        print("Using distributed mode for training.")
    elif args.world_size == 1 and ranks_num == 1:
        # Single GPU mode.
        assert torch.cuda.is_available(), "No available GPUs."
        args.gpu_id = args.gpu_ranks[0]
        assert args.gpu_id < torch.cuda.device_count(), "Invalid specified GPU device."
        args.dist_train = False
        args.single_gpu = True
        print("Using GPU %d for training." % args.gpu_id)
    else:
        # CPU mode.
        assert ranks_num == 0, "GPUs are specified, please check the arguments."
        args.dist_train = False
        args.single_gpu = False
        print("Using CPU mode for training.")

    trainer.train_and_validate(args)


if __name__ == "__main__":
    main()
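For orientation, a minimal launch sketch. The script name pretrain.py and the vocabulary/output paths are placeholders (file names are not shown in this diff); only the flag names come from the parser above.

# Hypothetical single-GPU launch; pretrain.py, vocab.txt and output_model.bin
# are placeholder names. dataset.pt is the default path written by the
# preprocessing script shown below.
import subprocess

subprocess.run([
    "python3", "pretrain.py",
    "--dataset_path", "dataset.pt",
    "--vocab_path", "vocab.txt",
    "--output_model_path", "output_model.bin",
    "--world_size", "1",
    "--gpu_ranks", "0",
    "--total_steps", "1000",
], check=True)

With world_size=1 and a single rank in gpu_ranks the script takes the single-GPU branch; world_size > 1 switches it to the distributed branch, which additionally asserts that enough CUDA devices are visible.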
@@ -0,0 +1,82 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import six
from packaging import version
from uer.utils.data import *
from uer.utils import *


assert version.parse(six.__version__) >= version.parse("1.12.0")


def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--corpus_path", type=str, required=True,
                        help="Path of the corpus for pretraining.")
    parser.add_argument("--vocab_path", default=None, type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--tgt_vocab_path", default=None, type=str,
                        help="Path of the target vocabulary file.")
    parser.add_argument("--tgt_spm_model_path", default=None, type=str,
                        help="Path of the target sentence piece model.")
    parser.add_argument("--dataset_path", type=str, default="dataset.pt",
                        help="Path of the preprocessed dataset.")

    # Preprocess options.
    parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
                        help="Specify the tokenizer. "
                             "Original Google BERT uses bert tokenizer on Chinese corpus. "
                             "Char tokenizer segments sentences into characters. "
                             "Space tokenizer segments sentences into words according to space."
                        )
    parser.add_argument("--tgt_tokenizer", choices=["bert", "char", "space"], default="bert",
                        help="Specify the tokenizer.")
    parser.add_argument("--processes_num", type=int, default=1,
                        help="Split the whole dataset into `processes_num` parts, "
                             "and each part is fed to a single process in training step.")
    parser.add_argument("--target", choices=["bert", "lm", "mlm", "bilm", "albert", "seq2seq", "t5", "cls", "prefixlm"], default="bert",
                        help="The training target of the pretraining model.")
    parser.add_argument("--docs_buffer_size", type=int, default=100000,
                        help="The buffer size of documents in memory, specific to targets that require negative sampling.")
    parser.add_argument("--seq_length", type=int, default=128, help="Sequence length of instances.")
    parser.add_argument("--tgt_seq_length", type=int, default=128, help="Target sequence length of instances.")
    parser.add_argument("--dup_factor", type=int, default=5,
                        help="Duplicate instances multiple times.")
    parser.add_argument("--short_seq_prob", type=float, default=0.1,
                        help="Probability of truncating sequence. "
                             "The larger value, the higher probability of using short (truncated) sequence.")
    parser.add_argument("--full_sentences", action="store_true", help="Full sentences.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")

    # Masking options.
    parser.add_argument("--dynamic_masking", action="store_true", help="Dynamic masking.")
    parser.add_argument("--whole_word_masking", action="store_true", help="Whole word masking.")
    parser.add_argument("--span_masking", action="store_true", help="Span masking.")
    parser.add_argument("--span_geo_prob", type=float, default=0.2,
                        help="Hyperparameter of geometric distribution for span masking.")
    parser.add_argument("--span_max_length", type=int, default=10,
                        help="Max length for span masking.")

    args = parser.parse_args()

    # Dynamic masking.
    if args.dynamic_masking:
        args.dup_factor = 1

    # Build tokenizer.
    tokenizer = str2tokenizer[args.tokenizer](args)
    if args.target == "seq2seq":
        args.tgt_tokenizer = str2tokenizer[args.tgt_tokenizer](args, False)

    # Build and save dataset.
    dataset = str2dataset[args.target](args, tokenizer.vocab, tokenizer)
    dataset.build_and_save(args.processes_num)


if __name__ == "__main__":
    main()
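The same style of sketch for the preprocessing step, again with hypothetical file names (preprocess.py, corpus.txt, vocab.txt); it writes the dataset.pt that the pretraining entry point above reads by default.

# Hypothetical preprocessing run with placeholder paths.
import subprocess

subprocess.run([
    "python3", "preprocess.py",
    "--corpus_path", "corpus.txt",
    "--vocab_path", "vocab.txt",
    "--dataset_path", "dataset.pt",
    "--processes_num", "4",
    "--target", "bert",
], check=True)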
@@ -0,0 +1,4 @@
torch>=1.0
argparse
six>=1.12.0
packaging
@@ -0,0 +1,7 @@
from uer.decoders.transformer_decoder import TransformerDecoder


str2decoder = {"transformer": TransformerDecoder}

__all__ = ["TransformerDecoder", "str2decoder"]
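The string keys in this registry match the corresponding command-line choices (for example --decoder transformer in the pretraining script above), so resolving a component is a plain dictionary lookup. A minimal sketch, where args stands in for a namespace carrying the model options the constructor expects:

# Resolve the --decoder choice to its class; construction needs the usual
# model options (layers_num, hidden_size, ...), which are omitted here.
from uer.decoders import str2decoder

decoder_class = str2decoder["transformer"]  # TransformerDecoder
# decoder = decoder_class(args)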
@@ -0,0 +1,80 @@
import torch
import torch.nn as nn
from uer.layers import *
from uer.layers.transformer import TransformerDecoderLayer
from uer.layers.layer_norm import LayerNorm, T5LayerNorm
from uer.layers.relative_position_embedding import RelativePositionEmbedding


class TransformerDecoder(nn.Module):
    """
    Transformer decoder: a stack of `layers_num` decoder layers that apply
    causal self-attention over the target embedding and attend to the
    encoder memory bank.
    """
    def __init__(self, args):
        super(TransformerDecoder, self).__init__()
        self.layers_num = args.layers_num
        self.layernorm_positioning = args.layernorm_positioning
        self.relative_position_embedding = args.relative_position_embedding
        self.share_relative_position_embedding = args.share_relative_position_embedding
        self.transformer_decoder = nn.ModuleList(
            [TransformerDecoderLayer(args) for _ in range(self.layers_num)]
        )

        has_bias = bool(1 - args.remove_transformer_bias)

        if self.layernorm_positioning == "pre":
            if args.layernorm == "t5":
                self.layer_norm = T5LayerNorm(args.hidden_size)
            else:
                self.layer_norm = LayerNorm(args.hidden_size)

        if self.relative_position_embedding:
            self.self_pos_emb = RelativePositionEmbedding(bidirectional=False, heads_num=args.heads_num,
                                                          num_buckets=args.relative_attention_buckets_num)
            if self.share_relative_position_embedding:
                self.context_pos_emb = self.self_pos_emb
            else:
                self.context_pos_emb = RelativePositionEmbedding(bidirectional=False, heads_num=args.heads_num,
                                                                 num_buckets=args.relative_attention_buckets_num)

    def forward(self, memory_bank, emb, additional_info):
        """
        Args:
            memory_bank: [batch_size x seq_length x emb_size]
            emb: [batch_size x seq_length x emb_size]
        Returns:
            hidden: [batch_size x seq_length x hidden_size]
        """
        _, src_seq_length, _ = memory_bank.size()
        batch_size, tgt_seq_length, _ = emb.size()

        mask_encoder = (additional_info[0] > 0). \
            unsqueeze(1). \
            repeat(1, tgt_seq_length, 1). \
            unsqueeze(1)
        mask_encoder = mask_encoder.float()
        mask_encoder = (1.0 - mask_encoder) * -10000.0

        mask_decoder = torch.ones(tgt_seq_length, tgt_seq_length, device=emb.device)
        mask_decoder = torch.tril(mask_decoder)
        mask_decoder = (1.0 - mask_decoder) * -10000
        mask_decoder = mask_decoder.repeat(batch_size, 1, 1, 1)

        hidden = emb

        if self.relative_position_embedding:
            self_position_bias = self.self_pos_emb(hidden, hidden)
            context_position_bias = self.context_pos_emb(hidden, memory_bank)
        else:
            self_position_bias = None
            context_position_bias = None

        for i in range(self.layers_num):
            hidden = self.transformer_decoder[i](hidden, memory_bank, mask_decoder, mask_encoder, self_position_bias, context_position_bias)

        if self.layernorm_positioning == "pre":
            return self.layer_norm(hidden)
        else:
            return hidden
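Both attention masks built in forward are additive: visible positions contribute 0 and masked positions contribute -10000, which effectively removes them after the softmax inside the attention layers. A standalone sketch of the causal self-attention mask for a target length of 4:

import torch

tgt_seq_length = 4
mask = torch.tril(torch.ones(tgt_seq_length, tgt_seq_length))  # 1 on and below the diagonal
mask = (1.0 - mask) * -10000.0  # row i keeps positions 0..i near 0 and pushes future positions to -10000

The encoder-decoder mask is built the same way from additional_info[0] > 0, except that it hides source padding tokens rather than future target positions.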
@@ -0,0 +1,17 @@
from uer.encoders.transformer_encoder import TransformerEncoder
from uer.encoders.rnn_encoder import RnnEncoder
from uer.encoders.rnn_encoder import LstmEncoder
from uer.encoders.rnn_encoder import GruEncoder
from uer.encoders.rnn_encoder import BirnnEncoder
from uer.encoders.rnn_encoder import BilstmEncoder
from uer.encoders.rnn_encoder import BigruEncoder
from uer.encoders.cnn_encoder import GatedcnnEncoder


str2encoder = {"transformer": TransformerEncoder, "rnn": RnnEncoder, "lstm": LstmEncoder,
               "gru": GruEncoder, "birnn": BirnnEncoder, "bilstm": BilstmEncoder, "bigru": BigruEncoder,
               "gatedcnn": GatedcnnEncoder}

__all__ = ["TransformerEncoder", "RnnEncoder", "LstmEncoder", "GruEncoder", "BirnnEncoder",
           "BilstmEncoder", "BigruEncoder", "GatedcnnEncoder", "str2encoder"]
@@ -0,0 +1,72 @@
import torch
import torch.nn as nn


class GatedcnnEncoder(nn.Module):
    def __init__(self, args):
        super(GatedcnnEncoder, self).__init__()
        self.layers_num = args.layers_num
        self.kernel_size = args.kernel_size
        self.block_size = args.block_size
        self.emb_size = args.emb_size
        self.hidden_size = args.hidden_size

        self.conv_1 = nn.Conv2d(1, args.hidden_size, (args.kernel_size, args.emb_size))
        self.gate_1 = nn.Conv2d(1, args.hidden_size, (args.kernel_size, args.emb_size))

        self.conv_b1 = nn.Parameter(torch.randn(1, args.hidden_size, 1, 1))
        self.gate_b1 = nn.Parameter(torch.randn(1, args.hidden_size, 1, 1))

        self.conv = nn.ModuleList(
            [
                nn.Conv2d(args.hidden_size, args.hidden_size, (args.kernel_size, 1))
                for _ in range(args.layers_num - 1)
            ]
        )
        self.gate = nn.ModuleList(
            [
                nn.Conv2d(args.hidden_size, args.hidden_size, (args.kernel_size, 1))
                for _ in range(args.layers_num - 1)
            ]
        )

        self.conv_b = nn.ParameterList(
            nn.Parameter(torch.randn(1, args.hidden_size, 1, 1))
            for _ in range(args.layers_num - 1)
        )
        self.gate_b = nn.ParameterList(
            nn.Parameter(torch.randn(1, args.hidden_size, 1, 1))
            for _ in range(args.layers_num - 1)
        )

    def forward(self, emb, seg):
        batch_size, seq_length, _ = emb.size()

        padding = torch.zeros([batch_size, self.kernel_size - 1, self.emb_size]).to(emb.device)
        emb = torch.cat([padding, emb], dim=1).unsqueeze(1)  # [batch_size, 1, seq_length + kernel_size - 1, emb_size]

        hidden = self.conv_1(emb)
        hidden += self.conv_b1.repeat(1, 1, seq_length, 1)
        gate = self.gate_1(emb)
        gate += self.gate_b1.repeat(1, 1, seq_length, 1)
        hidden = hidden * torch.sigmoid(gate)

        res_input = hidden

        padding = torch.zeros([batch_size, self.hidden_size, self.kernel_size - 1, 1]).to(emb.device)
        hidden = torch.cat([padding, hidden], dim=2)

        for i, (conv_i, gate_i) in enumerate(zip(self.conv, self.gate)):
            hidden, gate = conv_i(hidden), gate_i(hidden)
            hidden += self.conv_b[i].repeat(1, 1, seq_length, 1)
            gate += self.gate_b[i].repeat(1, 1, seq_length, 1)
            hidden = hidden * torch.sigmoid(gate)
            if (i + 1) % self.block_size == 0:
                hidden = hidden + res_input
                res_input = hidden
            hidden = torch.cat([padding, hidden], dim=2)

        hidden = hidden[:, :, self.kernel_size - 1:, :]
        output = hidden.transpose(1, 2).contiguous().view(batch_size, seq_length, self.hidden_size)

        return output
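As a quick shape check, a hedged sketch that pushes a random batch through the encoder; the hyper-parameter values are arbitrary and SimpleNamespace stands in for the real option namespace:

import torch
from types import SimpleNamespace
from uer.encoders.cnn_encoder import GatedcnnEncoder

args = SimpleNamespace(layers_num=4, kernel_size=3, block_size=2, emb_size=8, hidden_size=16)
encoder = GatedcnnEncoder(args)

emb = torch.randn(2, 5, args.emb_size)    # [batch_size, seq_length, emb_size]
seg = torch.ones(2, 5, dtype=torch.long)  # segment ids; unused by this encoder
out = encoder(emb, seg)
print(out.shape)  # torch.Size([2, 5, 16])

The left padding of kernel_size - 1 positions keeps the output at the input sequence length, and the residual connection is applied once every block_size layers.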