Commit

Initialization
linwhitehat committed Feb 9, 2022
1 parent 91003b5 commit e376fe3
Showing 76 changed files with 5,358 additions and 0 deletions.
386 changes: 386 additions & 0 deletions fine-tuning/run_classifier.py

Large diffs are not rendered by default.

125 changes: 125 additions & 0 deletions pre-training/pretrain.py
@@ -0,0 +1,125 @@
import argparse
import torch
import uer.trainer as trainer
from uer.utils.config import load_hyperparam
from uer.opts import *


def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Path options.
parser.add_argument("--dataset_path", type=str, default="dataset.pt",
help="Path of the preprocessed dataset.")
parser.add_argument("--vocab_path", default=None, type=str,
help="Path of the vocabulary file.")
parser.add_argument("--spm_model_path", default=None, type=str,
help="Path of the sentence piece model.")
parser.add_argument("--tgt_vocab_path", default=None, type=str,
help="Path of the target vocabulary file.")
parser.add_argument("--tgt_spm_model_path", default=None, type=str,
help="Path of the target sentence piece model.")
parser.add_argument("--pretrained_model_path", type=str, default=None,
help="Path of the pretrained model.")
parser.add_argument("--output_model_path", type=str, required=True,
help="Path of the output model.")
parser.add_argument("--config_path", type=str, default="models/bert/base_config.json",
help="Config file of model hyper-parameters.")

# Training and saving options.
parser.add_argument("--total_steps", type=int, default=100000,
help="Total training steps.")
parser.add_argument("--save_checkpoint_steps", type=int, default=10000,
help="Specific steps to save model checkpoint.")
parser.add_argument("--report_steps", type=int, default=100,
help="Specific steps to print prompt.")
parser.add_argument("--accumulation_steps", type=int, default=1,
help="Specific steps to accumulate gradient.")
parser.add_argument("--batch_size", type=int, default=32,
help="Training batch size. The actual batch_size is [batch_size x world_size x accumulation_steps].")
parser.add_argument("--instances_buffer_size", type=int, default=25600,
help="The buffer size of instances in memory.")
parser.add_argument("--labels_num", type=int, required=False,
help="Number of prediction labels.")
parser.add_argument("--dropout", type=float, default=0.1, help="Dropout value.")
parser.add_argument("--seed", type=int, default=7, help="Random seed.")

# Preprocess options.
parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
help="Specify the tokenizer."
"Original Google BERT uses bert tokenizer on Chinese corpus."
"Char tokenizer segments sentences into characters."
"Space tokenizer segments sentences into words according to space."
)

# Model options.
model_opts(parser)
parser.add_argument("--tgt_embedding", choices=["word", "word_pos", "word_pos_seg", "word_sinusoidalpos"], default="word_pos_seg",
help="Target embedding type.")
parser.add_argument("--decoder", choices=["transformer"], default="transformer", help="Decoder type.")
parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first",
help="Pooling type.")
parser.add_argument("--target", choices=["bert", "lm", "mlm", "bilm", "albert", "seq2seq", "t5", "cls", "prefixlm"], default="bert",
help="The training target of the pretraining model.")
parser.add_argument("--tie_weights", action="store_true",
help="Tie the word embedding and softmax weights.")
parser.add_argument("--has_lmtarget_bias", action="store_true",
help="Add bias on output_layer for lm target.")

# Masking options.
parser.add_argument("--whole_word_masking", action="store_true", help="Whole word masking.")
parser.add_argument("--span_masking", action="store_true", help="Span masking.")
parser.add_argument("--span_geo_prob", type=float, default=0.2,
help="Hyperparameter of geometric distribution for span masking.")
parser.add_argument("--span_max_length", type=int, default=10,
help="Max length for span masking.")

# Optimizer options.
optimization_opts(parser)

# GPU options.
parser.add_argument("--world_size", type=int, default=1, help="Total number of processes (GPUs) for training.")
parser.add_argument("--gpu_ranks", default=[], nargs='+', type=int, help="List of ranks of each process."
" Each process has a unique integer rank whose value is in the interval [0, world_size), and runs in a single GPU.")
parser.add_argument("--master_ip", default="tcp://localhost:12345", type=str, help="IP-Port of master for training.")
parser.add_argument("--backend", choices=["nccl", "gloo"], default="nccl", type=str, help="Distributed backend.")

args = parser.parse_args()

if args.target == "cls":
assert args.labels_num is not None, "The cls target requires --labels_num to be specified."

# Load hyper-parameters from config file.
if args.config_path:
load_hyperparam(args)

ranks_num = len(args.gpu_ranks)

if args.world_size > 1:
# Multiprocessing distributed mode.
assert torch.cuda.is_available(), "No available GPUs."
assert ranks_num <= args.world_size, "The number of started processes exceeds the `world_size` upper limit."
assert ranks_num <= torch.cuda.device_count(), "The number of started processes exceeds the number of available GPUs."
args.dist_train = True
args.ranks_num = ranks_num
print("Using distributed mode for training.")
elif args.world_size == 1 and ranks_num == 1:
# Single GPU mode.
assert torch.cuda.is_available(), "No available GPUs."
args.gpu_id = args.gpu_ranks[0]
assert args.gpu_id < torch.cuda.device_count(), "Invalid specified GPU device."
args.dist_train = False
args.single_gpu = True
print("Using GPU %d for training." % args.gpu_id)
else:
# CPU mode.
assert ranks_num == 0, "GPUs are specified, please check the arguments."
args.dist_train = False
args.single_gpu = False
print("Using CPU mode for training.")

trainer.train_and_validate(args)


if __name__ == "__main__":
main()
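
For readers skimming the diff, the branch at the end of main() maps --world_size and --gpu_ranks onto three training modes (multi-process distributed, single GPU, CPU), and the effective batch size is batch_size x world_size x accumulation_steps, as the --batch_size help text notes. A minimal standalone sketch of that mode-selection decision, with the torch.cuda availability checks omitted for brevity; the function name is illustrative and not part of the repository:

def select_training_mode(world_size, gpu_ranks):
    """Distilled mirror of the mode selection above (CUDA checks omitted)."""
    ranks_num = len(gpu_ranks)
    if world_size > 1:
        # One process per listed GPU rank; ranks_num may not exceed world_size.
        assert ranks_num <= world_size
        return "distributed"
    elif world_size == 1 and ranks_num == 1:
        # Train on the single GPU whose id is gpu_ranks[0].
        return "single_gpu"
    else:
        # No GPU ranks given: fall back to CPU training.
        assert ranks_num == 0
        return "cpu"

# Examples: select_training_mode(2, [0, 1]) -> "distributed"
#           select_training_mode(1, [0])    -> "single_gpu"
#           select_training_mode(1, [])     -> "cpu"
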
82 changes: 82 additions & 0 deletions preprocess.py
@@ -0,0 +1,82 @@
#!/usr/bin/python3
#-*- coding:utf-8 -*-
import argparse
import six
from packaging import version
from uer.utils.data import *
from uer.utils import *


assert version.parse(six.__version__) >= version.parse("1.12.0")


def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Path options.
parser.add_argument("--corpus_path", type=str, required=True,
help="Path of the corpus for pretraining.")
parser.add_argument("--vocab_path", default=None, type=str,
help="Path of the vocabulary file.")
parser.add_argument("--spm_model_path", default=None, type=str,
help="Path of the sentence piece model.")
parser.add_argument("--tgt_vocab_path", default=None, type=str,
help="Path of the target vocabulary file.")
parser.add_argument("--tgt_spm_model_path", default=None, type=str,
help="Path of the target sentence piece model.")
parser.add_argument("--dataset_path", type=str, default="dataset.pt",
help="Path of the preprocessed dataset.")

# Preprocess options.
parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
help="Specify the tokenizer."
"Original Google BERT uses bert tokenizer on Chinese corpus."
"Char tokenizer segments sentences into characters."
"Space tokenizer segments sentences into words according to space."
)
parser.add_argument("--tgt_tokenizer", choices=["bert", "char", "space"], default="bert",
help="Specify the tokenizer.")
parser.add_argument("--processes_num", type=int, default=1,
help="Split the whole dataset into `processes_num` parts, "
"and each part is fed to a single process in training step.")
parser.add_argument("--target", choices=["bert", "lm", "mlm", "bilm", "albert", "seq2seq", "t5", "cls", "prefixlm"], default="bert",
help="The training target of the pretraining model.")
parser.add_argument("--docs_buffer_size", type=int, default=100000,
help="The buffer size of documents in memory, specific to targets that require negative sampling.")
parser.add_argument("--seq_length", type=int, default=128, help="Sequence length of instances.")
parser.add_argument("--tgt_seq_length", type=int, default=128, help="Target sequence length of instances.")
parser.add_argument("--dup_factor", type=int, default=5,
help="Duplicate instances multiple times.")
parser.add_argument("--short_seq_prob", type=float, default=0.1,
help="Probability of truncating sequence."
"The larger value, the higher probability of using short (truncated) sequence.")
parser.add_argument("--full_sentences", action="store_true", help="Full sentences.")
parser.add_argument("--seed", type=int, default=7, help="Random seed.")

# Masking options.
parser.add_argument("--dynamic_masking", action="store_true", help="Dynamic masking.")
parser.add_argument("--whole_word_masking", action="store_true", help="Whole word masking.")
parser.add_argument("--span_masking", action="store_true", help="Span masking.")
parser.add_argument("--span_geo_prob", type=float, default=0.2,
help="Hyperparameter of geometric distribution for span masking.")
parser.add_argument("--span_max_length", type=int, default=10,
help="Max length for span masking.")

args = parser.parse_args()

# Dynamic masking.
if args.dynamic_masking:
args.dup_factor = 1

# Build tokenizer.
tokenizer = str2tokenizer[args.tokenizer](args)
if args.target == "seq2seq":
args.tgt_tokenizer = str2tokenizer[args.tgt_tokenizer](args, False)

# Build and save dataset.
dataset = str2dataset[args.target](args, tokenizer.vocab, tokenizer)
dataset.build_and_save(args.processes_num)


if __name__ == "__main__":
main()
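
The --span_masking, --span_geo_prob and --span_max_length options above only declare hyperparameters; the sampling itself lives in uer.utils.data and is not rendered in this commit view. As a hedged illustration of the usual SpanBERT-style scheme these options suggest (an assumption, not a copy of uer's code), span lengths can be drawn from a geometric distribution and clipped to the maximum length:

import random

def sample_span_length(span_geo_prob=0.2, span_max_length=10):
    """Illustrative only: draw a span length from Geometric(p), truncated at span_max_length."""
    length = 1
    while random.random() > span_geo_prob and length < span_max_length:
        length += 1
    return length

# A smaller span_geo_prob favors longer spans; span_max_length caps them.
# With the defaults above, most sampled spans are only a few tokens long.
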
4 changes: 4 additions & 0 deletions requirements.txt
@@ -0,0 +1,4 @@
torch>=1.0
argparse
six>=1.12.0
packaging
Empty file added uer/__init__.py
Binary file added uer/__pycache__/__init__.cpython-38.pyc
Binary file added uer/__pycache__/model_loader.cpython-38.pyc
Binary file added uer/__pycache__/model_saver.cpython-38.pyc
Binary file added uer/__pycache__/opts.cpython-38.pyc
7 changes: 7 additions & 0 deletions uer/decoders/__init__.py
@@ -0,0 +1,7 @@
from uer.decoders.transformer_decoder import TransformerDecoder


str2decoder = {"transformer": TransformerDecoder}

__all__ = ["TransformerDecoder", "str2decoder"]

80 changes: 80 additions & 0 deletions uer/decoders/transformer_decoder.py
@@ -0,0 +1,80 @@
import torch
import torch.nn as nn
from uer.layers import *
from uer.layers.transformer import TransformerDecoderLayer
from uer.layers.layer_norm import LayerNorm, T5LayerNorm
from uer.layers.relative_position_embedding import RelativePositionEmbedding


class TransformerDecoder(nn.Module):
"""
BERT encoder exploits 12 or 24 transformer layers to extract features.
"""
def __init__(self, args):
super(TransformerDecoder, self).__init__()
self.layers_num = args.layers_num
self.layernorm_positioning = args.layernorm_positioning
self.relative_position_embedding = args.relative_position_embedding
self.share_relative_position_embedding = args.share_relative_position_embedding
self.transformer_decoder = nn.ModuleList(
[TransformerDecoderLayer(args) for _ in range(self.layers_num)]
)

has_bias = bool(1 - args.remove_transformer_bias)

if self.layernorm_positioning == "pre":
if args.layernorm == "t5":
self.layer_norm = T5LayerNorm(args.hidden_size)
else:
self.layer_norm = LayerNorm(args.hidden_size)

if self.relative_position_embedding:
self.self_pos_emb = RelativePositionEmbedding(bidirectional=False, heads_num=args.heads_num,
num_buckets=args.relative_attention_buckets_num)
if self.share_relative_position_embedding:
self.context_pos_emb = self.self_pos_emb
else:
self.context_pos_emb = RelativePositionEmbedding(bidirectional=False, heads_num=args.heads_num,
num_buckets=args.relative_attention_buckets_num)


def forward(self, memory_bank, emb, additional_info):
"""
Args:
memory_bank: [batch_size x seq_length x emb_size]
emb: [batch_size x seq_length x emb_size]
Returns:
hidden: [batch_size x seq_length x hidden_size]
"""
_, src_seq_length, _ = memory_bank.size()
batch_size, tgt_seq_length, _ = emb.size()

mask_encoder = (additional_info[0] > 0). \
unsqueeze(1). \
repeat(1, tgt_seq_length, 1). \
unsqueeze(1)
mask_encoder = mask_encoder.float()
mask_encoder = (1.0 - mask_encoder) * -10000.0

mask_decoder = torch.ones(tgt_seq_length, tgt_seq_length, device=emb.device)
mask_decoder = torch.tril(mask_decoder)
mask_decoder = (1.0 - mask_decoder) * -10000
mask_decoder = mask_decoder.repeat(batch_size, 1, 1, 1)

hidden = emb

if self.relative_position_embedding:
self_position_bias = self.self_pos_emb(hidden, hidden)
context_position_bias = self.context_pos_emb(hidden, memory_bank)
else:
self_position_bias = None
context_position_bias = None

for i in range(self.layers_num):
hidden = self.transformer_decoder[i](hidden, memory_bank, mask_decoder, mask_encoder, self_position_bias, context_position_bias)

if self.layernorm_positioning == "pre":
return self.layer_norm(hidden)
else:
return hidden
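
Both masks built in forward() follow the additive attention-mask convention: positions that may be attended contribute 0 to the attention logits, while blocked positions contribute -10000 (effectively minus infinity after softmax). A small standalone sketch of the causal decoder mask for a 4-token target, matching the torch.tril construction above:

import torch

tgt_seq_length = 4
causal = torch.tril(torch.ones(tgt_seq_length, tgt_seq_length))  # 1 = visible, 0 = future position
mask_decoder = (1.0 - causal) * -10000.0
# Row i now has 0 in columns 0..i and -10000 in columns i+1.., so target token i
# can attend only to itself and to earlier target positions.
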

17 changes: 17 additions & 0 deletions uer/encoders/__init__.py
@@ -0,0 +1,17 @@
from uer.encoders.transformer_encoder import TransformerEncoder
from uer.encoders.rnn_encoder import RnnEncoder
from uer.encoders.rnn_encoder import LstmEncoder
from uer.encoders.rnn_encoder import GruEncoder
from uer.encoders.rnn_encoder import BirnnEncoder
from uer.encoders.rnn_encoder import BilstmEncoder
from uer.encoders.rnn_encoder import BigruEncoder
from uer.encoders.cnn_encoder import GatedcnnEncoder


str2encoder = {"transformer": TransformerEncoder, "rnn": RnnEncoder, "lstm": LstmEncoder,
"gru": GruEncoder, "birnn": BirnnEncoder, "bilstm": BilstmEncoder, "bigru": BigruEncoder,
"gatedcnn": GatedcnnEncoder}

__all__ = ["TransformerEncoder", "RnnEncoder", "LstmEncoder", "GruEncoder", "BirnnEncoder",
"BilstmEncoder", "BigruEncoder", "GatedcnnEncoder", "str2encoder"]

Binary file added uer/encoders/__pycache__/__init__.cpython-38.pyc
72 changes: 72 additions & 0 deletions uer/encoders/cnn_encoder.py
@@ -0,0 +1,72 @@
import torch
import torch.nn as nn


class GatedcnnEncoder(nn.Module):
def __init__(self, args):
super(GatedcnnEncoder, self).__init__()
self.layers_num = args.layers_num
self.kernel_size = args.kernel_size
self.block_size = args.block_size
self.emb_size = args.emb_size
self.hidden_size = args.hidden_size

self.conv_1 = nn.Conv2d(1, args.hidden_size, (args.kernel_size, args.emb_size))
self.gate_1 = nn.Conv2d(1, args.hidden_size, (args.kernel_size, args.emb_size))

self.conv_b1 = nn.Parameter(torch.randn(1, args.hidden_size, 1, 1))
self.gate_b1 = nn.Parameter(torch.randn(1, args.hidden_size, 1, 1))

self.conv = nn.ModuleList(
[
nn.Conv2d(args.hidden_size, args.hidden_size, (args.kernel_size, 1))
for _ in range(args.layers_num - 1)
]
)
self.gate = nn.ModuleList(
[
nn.Conv2d(args.hidden_size, args.hidden_size, (args.kernel_size, 1))
for _ in range(args.layers_num - 1)
]
)

self.conv_b = nn.ParameterList(
nn.Parameter(torch.randn(1, args.hidden_size, 1, 1))
for _ in range(args.layers_num - 1)
)
self.gate_b = nn.ParameterList(
nn.Parameter(torch.randn(1, args.hidden_size, 1, 1))
for _ in range(args.layers_num - 1)
)

def forward(self, emb, seg):
batch_size, seq_length, _ = emb.size()

padding = torch.zeros([batch_size, self.kernel_size-1, self.emb_size]).to(emb.device)
emb = torch.cat([padding, emb], dim=1).unsqueeze(1) # [batch_size, 1, seq_length + kernel_size - 1, emb_size]

hidden = self.conv_1(emb)
hidden += self.conv_b1.repeat(1, 1, seq_length, 1)
gate = self.gate_1(emb)
gate += self.gate_b1.repeat(1, 1, seq_length, 1)
hidden = hidden * torch.sigmoid(gate)

res_input = hidden

padding = torch.zeros([batch_size, self.hidden_size, self.kernel_size-1, 1]).to(emb.device)
hidden = torch.cat([padding, hidden], dim=2)

for i, (conv_i, gate_i) in enumerate(zip(self.conv, self.gate)):
hidden, gate = conv_i(hidden), gate_i(hidden)
hidden += self.conv_b[i].repeat(1, 1, seq_length, 1)
gate += self.gate_b[i].repeat(1, 1, seq_length, 1)
hidden = hidden * torch.sigmoid(gate)
if (i + 1) % self.block_size == 0:
hidden = hidden + res_input
res_input = hidden
hidden = torch.cat([padding, hidden], dim=2)

hidden = hidden[:, :, self.kernel_size - 1:, :]
output = hidden.transpose(1, 2).contiguous().view(batch_size, seq_length, self.hidden_size)

return output
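
A minimal smoke test for the encoder above, assuming the uer package is importable; the hyperparameter values are illustrative, not the repository's defaults. The left zero-padding of kernel_size - 1 positions acts as causal-style "same" padding, so the output keeps the input's sequence length:

from types import SimpleNamespace

import torch

from uer.encoders.cnn_encoder import GatedcnnEncoder

# Illustrative hyperparameters (assumptions for the sketch only).
args = SimpleNamespace(layers_num=2, kernel_size=3, block_size=2, emb_size=64, hidden_size=64)
encoder = GatedcnnEncoder(args)

emb = torch.randn(4, 32, args.emb_size)       # [batch_size, seq_length, emb_size]
seg = torch.ones(4, 32, dtype=torch.long)     # segment ids; unused by this encoder's forward
out = encoder(emb, seg)
print(out.shape)                              # torch.Size([4, 32, 64])
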
