【Task 3】 Construction of Dataset and Method Implementation for Named Entity Recognition in the Open Source Community #75

Open · wants to merge 1 commit into main
46 changes: 46 additions & 0 deletions task3/NER/Config.py
@@ -0,0 +1,46 @@
import os

class config:
root = os.getcwd()
dataset = 'chinese ner'
train_data_path = os.path.join(root, 'input/train.json')
dev_data_path = os.path.join(root, 'input/dev.json')
test_data_path = os.path.join(root, 'input/test.json')

cache_path = os.path.join(root, 'cache/')

save_path = os.path.join(root, 'saved_models/model.pt')
predict_path = os.path.join(root, 'output/predict.json')

dist_emb_size = 20
type_emb_size = 20
lstm_hid_size = 512
conv_hid_size = 96
bert_hid_size = 768
biaffine_size = 512
ffnn_hid_size = 288

dilation = [1, 2, 3]

emb_dropout = 0.5
conv_dropout = 0.5
out_dropout = 0.33

epochs = 10
batch_size = 4
checkout_params = {'batch_size': 1, 'shuffle': False}
train_params = {'batch_size': 1, 'shuffle': True}
dev_params = {'batch_size': 1, 'shuffle': False}
test_params = {'batch_size': 1, 'shuffle': False}

learning_rate = 1e-3
weight_decay = 0
clip_grad_norm = 5.0
bert_name = 'bert-base-uncased'
bert_learning_rate = 5e-6
warm_factor = 0.1

use_bert_last_4_layers = True

seed = 2022
logger = None
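
The `*_params` dicts above read like keyword arguments for `torch.utils.data.DataLoader`. A minimal sketch of how a trainer might unpack them; the `TensorDataset` here is only a stand-in for the project's real dataset class, not the PR's actual code:

```python
# Sketch only, not part of this PR: unpacking the *_params dicts into DataLoaders.
import torch
from torch.utils.data import DataLoader, TensorDataset

from Config import config

dummy = TensorDataset(torch.arange(8).unsqueeze(1))      # stand-in dataset
train_loader = DataLoader(dummy, **config.train_params)  # batch_size=1, shuffle=True
dev_loader = DataLoader(dummy, **config.dev_params)      # batch_size=1, shuffle=False

for (x,) in train_loader:
    pass  # one sample per step, reshuffled every epoch
```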
253 changes: 253 additions & 0 deletions task3/NER/Model.py
@@ -0,0 +1,253 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from transformers import AutoModel


class LayerNorm(nn.Module):
def __init__(self, input_dim, cond_dim=0, center=True, scale=True, epsilon=None, conditional=False,
hidden_units=None, hidden_activation='linear', hidden_initializer='xavier', **kwargs):
"""
input_dim: inputs.shape[-1]
cond_dim: cond.shape[-1]
"""
super(LayerNorm, self).__init__()
self.center = center
self.scale = scale
self.conditional = conditional
self.hidden_units = hidden_units
self.hidden_initializer = hidden_initializer
self.epsilon = epsilon or 1e-12
self.input_dim = input_dim
self.cond_dim = cond_dim

if self.center:
self.beta = nn.Parameter(torch.zeros(input_dim))
if self.scale:
self.gamma = nn.Parameter(torch.ones(input_dim))

if self.conditional:
if self.hidden_units is not None:
self.hidden_dense = nn.Linear(in_features=self.cond_dim, out_features=self.hidden_units, bias=False)
if self.center:
self.beta_dense = nn.Linear(in_features=self.cond_dim, out_features=input_dim, bias=False)
if self.scale:
self.gamma_dense = nn.Linear(in_features=self.cond_dim, out_features=input_dim, bias=False)

self.initialize_weights()

def initialize_weights(self):

if self.conditional:
if self.hidden_units is not None:
if self.hidden_initializer == 'normal':
torch.nn.init.normal_(self.hidden_dense.weight)
elif self.hidden_initializer == 'xavier': # glorot_uniform
torch.nn.init.xavier_uniform_(self.hidden_dense.weight)

if self.center:
torch.nn.init.constant_(self.beta_dense.weight, 0)
if self.scale:
torch.nn.init.constant_(self.gamma_dense.weight, 0)

def forward(self, inputs, cond=None):
if self.conditional:
if self.hidden_units is not None:
cond = self.hidden_dense(cond)

for _ in range(len(inputs.shape) - len(cond.shape)):
cond = cond.unsqueeze(1) # cond = K.expand_dims(cond, 1)

if self.center:
beta = self.beta_dense(cond) + self.beta
if self.scale:
gamma = self.gamma_dense(cond) + self.gamma
else:
if self.center:
beta = self.beta
if self.scale:
gamma = self.gamma

outputs = inputs
if self.center:
mean = torch.mean(outputs, dim=-1).unsqueeze(-1)
outputs = outputs - mean
if self.scale:
variance = torch.mean(outputs ** 2, dim=-1).unsqueeze(-1)
std = (variance + self.epsilon) ** 0.5
outputs = outputs / std
outputs = outputs * gamma
if self.center:
outputs = outputs + beta

return outputs


class ConvolutionLayer(nn.Module):
def __init__(self, input_size, channels, dilation, dropout=0.1):
super(ConvolutionLayer, self).__init__()
self.base = nn.Sequential(
nn.Dropout2d(dropout),
nn.Conv2d(input_size, channels, kernel_size=1),
nn.GELU(),
)

self.convs = nn.ModuleList(
[nn.Conv2d(channels, channels, kernel_size=3, groups=channels, dilation=d, padding=d) for d in dilation])

def forward(self, x):
x = x.permute(0, 3, 1, 2).contiguous()
x = self.base(x)

outputs = []
for conv in self.convs:
x = conv(x)
x = F.gelu(x)
outputs.append(x)
outputs = torch.cat(outputs, dim=1)
outputs = outputs.permute(0, 2, 3, 1).contiguous()
return outputs


class Biaffine(nn.Module):
def __init__(self, n_in, n_out=1, bias_x=True, bias_y=True):
super(Biaffine, self).__init__()

self.n_in = n_in
self.n_out = n_out
self.bias_x = bias_x
self.bias_y = bias_y
weight = torch.zeros((n_out, n_in + int(bias_x), n_in + int(bias_y)))
nn.init.xavier_normal_(weight)
self.weight = nn.Parameter(weight, requires_grad=True)

def extra_repr(self):
s = f"n_in={self.n_in}, n_out={self.n_out}"
if self.bias_x:
s += f", bias_x={self.bias_x}"
if self.bias_y:
s += f", bias_y={self.bias_y}"

return s

def forward(self, x, y):
if self.bias_x:
x = torch.cat((x, torch.ones_like(x[..., :1])), -1)
if self.bias_y:
y = torch.cat((y, torch.ones_like(y[..., :1])), -1)
# [batch_size, n_out, seq_len, seq_len]
s = torch.einsum('bxi,oij,byj->boxy', x, self.weight, y)
# move the output-class dimension last: [batch_size, seq_len, seq_len, n_out]
s = s.permute(0, 2, 3, 1)

return s


class MLP(nn.Module):
def __init__(self, n_in, n_out, dropout=0):
super().__init__()

self.linear = nn.Linear(n_in, n_out)
self.activation = nn.GELU()
self.dropout = nn.Dropout(dropout)

def forward(self, x):
x = self.dropout(x)
x = self.linear(x)
x = self.activation(x)
return x


class CoPredictor(nn.Module):
def __init__(self, cls_num, hid_size, biaffine_size, channels, ffnn_hid_size, dropout=0):
super().__init__()
self.mlp1 = MLP(n_in=hid_size, n_out=biaffine_size, dropout=dropout)
self.mlp2 = MLP(n_in=hid_size, n_out=biaffine_size, dropout=dropout)
self.biaffine = Biaffine(n_in=biaffine_size, n_out=cls_num, bias_x=True, bias_y=True)
self.mlp_rel = MLP(channels, ffnn_hid_size, dropout=dropout)
self.linear = nn.Linear(ffnn_hid_size, cls_num)
self.dropout = nn.Dropout(dropout)

def forward(self, x, y, z):
h = self.dropout(self.mlp1(x))
t = self.dropout(self.mlp2(y))
o1 = self.biaffine(h, t)

z = self.dropout(self.mlp_rel(z))
o2 = self.linear(z)
return o1 + o2


class Model(nn.Module):
def __init__(self, config):
super(Model, self).__init__()
self.use_bert_last_4_layers = config.use_bert_last_4_layers

self.lstm_hid_size = config.lstm_hid_size
self.conv_hid_size = config.conv_hid_size

lstm_input_size = 0

self.bert = AutoModel.from_pretrained(config.bert_name, cache_dir="./cache/", output_hidden_states=True)
lstm_input_size += config.bert_hid_size

self.dis_embs = nn.Embedding(20, config.dist_emb_size)
self.reg_embs = nn.Embedding(3, config.type_emb_size)

self.encoder = nn.LSTM(lstm_input_size, config.lstm_hid_size // 2, num_layers=1, batch_first=True,
bidirectional=True)

conv_input_size = config.lstm_hid_size + config.dist_emb_size + config.type_emb_size

self.convLayer = ConvolutionLayer(conv_input_size, config.conv_hid_size, config.dilation, config.conv_dropout)
self.dropout = nn.Dropout(config.emb_dropout)
self.predictor = CoPredictor(config.label_num, config.lstm_hid_size, config.biaffine_size,
config.conv_hid_size * len(config.dilation), config.ffnn_hid_size,
config.out_dropout)

self.cln = LayerNorm(config.lstm_hid_size, config.lstm_hid_size, conditional=True)

def forward(self, bert_inputs, grid_mask2d, dist_inputs, pieces2word, sent_length):
'''
:param bert_inputs: [B, L']
:param grid_mask2d: [B, L, L]
:param dist_inputs: [B, L, L]
:param pieces2word: [B, L, L']
:param sent_length: [B]
:return:
'''
bert_embs = self.bert(input_ids=bert_inputs, attention_mask=bert_inputs.ne(0).float())
if self.use_bert_last_4_layers:
bert_embs = torch.stack(bert_embs[2][-4:], dim=-1).mean(-1)
else:
bert_embs = bert_embs[0]

length = pieces2word.size(1)

min_value = torch.min(bert_embs).item()

# Max pooling word representations from pieces
_bert_embs = bert_embs.unsqueeze(1).expand(-1, length, -1, -1)
_bert_embs = torch.masked_fill(_bert_embs, pieces2word.eq(0).unsqueeze(-1), min_value)
word_reps, _ = torch.max(_bert_embs, dim=2)

word_reps = self.dropout(word_reps)
packed_embs = pack_padded_sequence(word_reps, sent_length.cpu(), batch_first=True, enforce_sorted=False)
packed_outs, (hidden, _) = self.encoder(packed_embs)
word_reps, _ = pad_packed_sequence(packed_outs, batch_first=True, total_length=sent_length.max())

cln = self.cln(word_reps.unsqueeze(2), word_reps)

dis_emb = self.dis_embs(dist_inputs)
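# region ids: 0 = padding, 1 = upper-triangular cells, 2 = lower-triangular cells (incl. diagonal)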
tril_mask = torch.tril(grid_mask2d.clone().long())
reg_inputs = tril_mask + grid_mask2d.clone().long()
reg_emb = self.reg_embs(reg_inputs)

conv_inputs = torch.cat([dis_emb, reg_emb, cln], dim=-1)
conv_inputs = torch.masked_fill(conv_inputs, grid_mask2d.eq(0).unsqueeze(-1), 0.0)
conv_outputs = self.convLayer(conv_inputs)
conv_outputs = torch.masked_fill(conv_outputs, grid_mask2d.eq(0).unsqueeze(-1), 0.0)
outputs = self.predictor(word_reps, word_reps, conv_outputs)

return outputs
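
For orientation, here is a small smoke-test sketch (not part of the diff) that builds dummy tensors with the shapes documented in `Model.forward` and runs them through the model. It assumes `config` gains a `label_num` attribute, which `Config.py` above does not define, and it downloads `bert-base-uncased` on first run:

```python
# Smoke-test sketch only; `label_num` and the toy piece-to-word alignment are assumptions.
import torch
from Config import config
from Model import Model

config.label_num = 5                 # hypothetical number of entity label ids
B, L, Lp = 2, 6, 10                  # batch size, word length L, sub-token length L'

model = Model(config)
bert_inputs = torch.randint(1, 1000, (B, Lp))            # [B, L']
grid_mask2d = torch.ones(B, L, L, dtype=torch.bool)      # [B, L, L]
dist_inputs = torch.randint(0, 20, (B, L, L))            # [B, L, L]
pieces2word = torch.zeros(B, L, Lp, dtype=torch.bool)    # [B, L, L']
for i in range(L):
    pieces2word[:, i, i + 1] = True                      # toy one-piece-per-word alignment
sent_length = torch.full((B,), L, dtype=torch.long)      # [B]

with torch.no_grad():
    out = model(bert_inputs, grid_mask2d, dist_inputs, pieces2word, sent_length)
print(out.shape)                     # expected: [B, L, L, label_num]
```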
44 changes: 44 additions & 0 deletions task3/NER/README.md
@@ -0,0 +1,44 @@
# W2NER

This code is written with reference to the W2NER paper and model. Paper link: [arXiv:2112.10070](https://arxiv.org/pdf/2112.10070.pdf)



## Running the code

Run main.py on Colab. (Colab may ignore empty folders when the project is uploaded, so directories such as output and saved_models may need to be created manually; a small sketch for this follows.)
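
A minimal sketch for creating the folders Config.py expects (paths taken from Config.py; adjust if yours differ):

```python
# Create the directories used by Config.py; Colab may have dropped them if they were empty.
import os

for d in ['input', 'cache', 'output', 'saved_models']:
    os.makedirs(d, exist_ok=True)
```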



## Code structure

Since I did not use the W2NER model from the start, the code for the different models also differs somewhat. The code is split into utils, Model, Config, Trainer, and main; to switch to a different model, only the APIs inside utils and the Model (and perhaps part of the Trainer) need to be rewritten.

### utils

Split into common, DataProcess, and the interface APIs for the different text formats.

#### common

Provides the logger, read_from_file, and write_to_file functions; a hypothetical sketch of them follows.
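
The implementations are not shown in this diff; the sketch below is an assumption about what these helpers could look like (the `get_logger` name and the JSON format are guesses):

```python
# Hypothetical sketch only; the real utils/common in this PR may differ.
import json
import logging

def get_logger(name='NER'):
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
    return logging.getLogger(name)

def read_from_file(path):
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def write_to_file(path, data):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
```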

#### DataProcess

Provides the Process class, the interface for data preprocessing, split into encode (preprocessing) and decode (formatting).

#### APIs

APIDataset, api_encode, and api_decode need to be rewritten for each interface: api_encode receives the raw JSON data and returns the data accepted by APIDataset; APIDataset receives api_encode's output and returns the Dataset the Model needs; api_decode receives the Model's output and returns formatted dictionary (JSON) data. A skeleton of these three pieces is sketched below.
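
A hypothetical skeleton only; the signatures and placeholder bodies are assumptions, not the PR's actual code:

```python
from torch.utils.data import Dataset

def api_encode(raw_json, config):
    """Turns raw JSON records into the samples APIDataset accepts (placeholder body)."""
    return [record for record in raw_json]

class APIDataset(Dataset):
    """Wraps api_encode's output as the Dataset the Model consumes."""
    def __init__(self, samples):
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

def api_decode(model_outputs, config):
    """Turns Model outputs back into formatted dict (JSON) predictions (placeholder body)."""
    return [{'entities': []} for _ in model_outputs]
```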

### Model

The model itself.

### Config

Shared configuration.

### Trainer

The training class, split into train, eval, and predict; an assumed outline is sketched below.
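
An assumed outline only: the optimizer choice and method bodies are guesses; the train/eval/predict split is the only part taken from this README.

```python
# Hypothetical outline only; the real Trainer in this PR may differ.
import torch

class Trainer:
    def __init__(self, model, config):
        self.model = model
        self.config = config
        self.optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate,
                                           weight_decay=config.weight_decay)

    def train(self, loader):
        self.model.train()
        for batch in loader:
            self.optimizer.zero_grad()
            # loss = ... compute from self.model(*batch) and the gold label grid
            # loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.clip_grad_norm)
            self.optimizer.step()

    def eval(self, loader):
        self.model.eval()
        # score predictions against gold labels (e.g. precision/recall/F1)

    def predict(self, loader):
        self.model.eval()
        # run the model and api_decode the outputs into output/predict.json
```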
