-
Notifications
You must be signed in to change notification settings - Fork 192
/
Copy pathmodel.py
178 lines (162 loc) · 8.71 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# -*- coding: utf-8 -*-
import torch
import logging
import torch.nn as nn
import torch.nn.functional as F
from utils.greedysearch import GreedySearchDecoder
class EncoderRNN(nn.Module):
def __init__(self, opt, voc_length):
'''
voc_length: 字典长度,即输入的单词的one-hot编码长度
'''
super(EncoderRNN, self).__init__()
self.num_layers = opt.num_layers
self.hidden_size = opt.hidden_size
#nn.Embedding输入向量维度是字典长度,输出向量维度是词向量维度
self.embedding = nn.Embedding(voc_length, opt.embedding_dim)
#双向GRU作为Encoder
self.gru = nn.GRU(opt.embedding_dim, self.hidden_size, self.num_layers,
dropout=(0 if opt.num_layers == 1 else opt.dropout), bidirectional=opt.bidirectional)
def forward(self, input_seq, input_lengths, hidden=None):
'''
input_seq:
shape: [max_seq_len, batch_size]
input_lengths:
一批次中每个句子对应的句子长度列表
shape:[batch_size]
hidden:
Encoder的初始hidden输入,默认为None
shape: [num_layers*num_directions, batch_size, hidden_size]
实际排列顺序是num_directions在前面,
即对于4层双向的GRU, num_layers*num_directions = 8
前4层是正向: [:4, batch_size, hidden_size]
后4层是反向: [4:, batch_size, hidden_size]
embedded:
经过词嵌入后的词向量
shape: [max_seq_len, batch_size, embedding_dim]
outputs:
所有时刻的hidden层输出
一开始的shape: [max_seq_len, batch_size, hidden_size*num_directions]
注意: num_directions在前面, 即前面hidden_size个是正向的,后面hidden_size个是反向的
正向: [:, :, :hidden_size] 反向: [:, :, hidden_size:]
最后对双向GRU求和,得到最终的outputs: shape为[max_seq_len, batch_size, hidden_size]
输出的hidden:
[num_layers*num_directions, batch_size, hidden_size]
'''
embedded = self.embedding(input_seq)
packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
outputs, hidden = self.gru(packed, hidden)
outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
return outputs, hidden
class Attn(torch.nn.Module):
def __init__(self, attn_method, hidden_size):
super(Attn, self).__init__()
self.method = attn_method #attention方法
self.hidden_size = hidden_size
if self.method not in ['dot', 'general', 'concat']:
raise ValueError(self.method, "is not an appropriate attention method.")
if self.method == 'general':
self.attn = torch.nn.Linear(self.hidden_size, self.hidden_size)
elif self.method == 'concat':
self.attn = torch.nn.Linear(self.hidden_size * 2, self.hidden_size)
self.v = torch.nn.Parameter(torch.FloatTensor(self.hidden_size))
def dot_score(self, hidden, encoder_outputs):
'''
encoder_outputs:
encoder(双向GRU)的所有时刻的最后一层的hidden输出
shape: [max_seq_len, batch_size, hidden_size]
数学符号表示: h_s
hidden:
decoder(单向GRU)的所有时刻的最后一层的hidden输出,即decoder_ouputs
shape: [max_seq_len, batch_size, hidden_size]
数学符号表示: h_t
注意: attention method: 'dot', Hadamard乘法,对应元素相乘,用*就好了
torch.matmul是矩阵乘法, 所以最后的结果是h_s * h_t
h_s的元素是一个hidden_size向量, 要得到score值,需要在dim=2上求和
相当于先不看batch_size,h_s * h_t 要得到的是 [max_seq_len]
即每个时刻都要得到一个分数值, 最后把batch_size加进来,
最终shape为: [max_seq_len, batch_size]
'''
return torch.sum(hidden * encoder_outputs, dim=2)
def general_score(self, hidden, encoder_outputs):
#先学习到一个线性变换Wh_s,即energy
#然后进行点乘,所以最后结果为 h_t*Wh_s
energy = self.attn(encoder_outputs)
return torch.sum(hidden * energy, dim=2)
def concat_score(self, hidden, encoder_outputs):
'''
hidden:
h_t, shape: [max_seq_len, batch_size, hidden_size]
expand(max_seq_len, -1,-1) ==> [max_seq_len, batch_size, hidden_size]
与encoder_outputs在第2维上进行cat, 最后shape: [max_seq_len, batch_size, hidden_size*2]
经过attn后得到[max_seq_len, batch_size, hidden_size],再进行tanh,shape不变
最后与v乘
'''
energy = self.attn(torch.cat((hidden.expand(encoder_outputs.size(0), -1, -1),
encoder_outputs), 2)).tanh()
return torch.sum(self.v * energy, dim=2)
def forward(self, hidden, encoder_outputs):
if self.method == 'general':
attn_energies = self.general_score(hidden, encoder_outputs)
elif self.method == 'concat':
attn_energies = self.concat_score(hidden, encoder_outputs)
elif self.method == 'dot':
attn_energies = self.dot_score(hidden, encoder_outputs)
#得到score,shape为[max_seq_len, batch_size],然后转置为[batch_size, max_seq_len]
attn_energies = attn_energies.t()
#对dim=1进行softmax,然后插入维度[batch_size, 1, max_seq_len]
return F.softmax(attn_energies, dim=1).unsqueeze(1)
class LuongAttnDecoderRNN(nn.Module):
def __init__(self, opt, voc_length):
super(LuongAttnDecoderRNN, self).__init__()
self.attn_method = opt.method
self.hidden_size = opt.hidden_size
self.output_size = voc_length
self.num_layers = opt.num_layers
self.dropout = opt.dropout
self.embedding = nn.Embedding(voc_length, opt.embedding_dim)
self.embedding_dropout = nn.Dropout(self.dropout)
self.gru = nn.GRU(opt.embedding_dim, self.hidden_size, self.num_layers, dropout=(0 if self.num_layers == 1 else self.dropout))
self.concat = nn.Linear(self.hidden_size * 2, self.hidden_size)
self.out = nn.Linear(self.hidden_size, self.output_size)
self.attn = Attn(self.attn_method, self.hidden_size)
def forward(self, input_step, last_hidden, encoder_outputs):
'''
input_step:
decoder是逐字生成的,即每个timestep产生一个字,
decoder接收的输入: input_step='/SOS'的索引 和 encoder的最后时刻的最后一层hidden输出
故shape:[1, batch_size]
last_hidden:
上一个GRUCell的hidden输出
初始值为encoder的最后时刻的最后一层hidden输出,传入的是encoder_hidden的正向部分,
即encoder_hidden[:decoder.num_layers], 为了和decoder对应,所以取的是decoder的num_layers
shape为[num_layers, batch_size, hidden_size]
encoder_outputs:
这里还接收了encoder_outputs输入,用于计算attention
'''
#转为词向量[1, batch_size, embedding_dim]
embedded = self.embedding(input_step)
embedded = self.embedding_dropout(embedded)
#rnn_output: [1, batch_size, hidden_size]
#hidden: [num_layers, batch_size, hidden_size]
rnn_output, hidden = self.gru(embedded, last_hidden)
#attn_weights: [batch_size, 1, max_seq_len]
attn_weights = self.attn(rnn_output, encoder_outputs)
#bmm批量矩阵相乘,
#attn_weights是batch_size个矩阵[1, max_seq_len]
#encoder_outputs.transpose(0, 1)是batch_size个[max_seq_len, hidden_size]
#相乘结果context为: [batch_size, 1, hidden_size]
context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
rnn_output = rnn_output.squeeze(0)
context = context.squeeze(1)
#压缩维度后,rnn_output[batch_size, hidden_size],context[batch_size, hidden_size]
#在dim=1上连接: [batch_size, hidden_size * 2], contact后[batch_size, hidden_size]
#tanh后shape不变: [batch_size, hidden_size]
#最后进行全连接,映射为[batch_size, voc_length]
#最后进行softmax: [batch_size, voc_length]
concat_input = torch.cat((rnn_output, context), 1)
concat_output = torch.tanh(self.concat(concat_input))
output = self.out(concat_output)
output = F.softmax(output, dim=1)
return output, hidden