forked from zxjzxj9/PyTorchIntroduction
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdeep_speech.py
140 lines (114 loc) · 4.25 KB
/
deep_speech.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
""" This code is provided only as a reference implementation of the DeepSpeech model.
class BNGRU(nn.Module):
def __init__(self, input_size, hidden_size):
super(BNGRU, self).__init__()
self.hidden_size = hidden_size
self.bn = nn.BatchNorm1d(input_size)
self.gru = nn.GRU(input_size, hidden_size, bidirectional=True)
def forward(self, x, xlen):
maxlen = x.size(2)
x = self.bn(x)
# N×C×T -> T×N×C
x = x.permute(2, 0, 1)
x = nn.utils.rnn.pack_padded_sequence(x, xlen)
x, _ = self.gru(x)
x, _ = nn.utils.rnn.pad_packed_sequence(x, total_length=maxlen)
x = x[..., :self.hidden_size] + x[..., self.hidden_size:]
# T×N×C -> N×C×T
x = x.permute(1, 2, 0)
return x
class DeepSpeech(nn.Module):
def __init__(self, mel_channel, channels, kernel_dims, strides,
num_layers, hidden_size, char_size):
super(DeepSpeech, self).__init__()
self.kernel_dims = kernel_dims
self.strides = strides
self.num_layers = num_layers
self.hidden_size = hidden_size
self.char_size = char_size
self.cnns = nn.ModuleList()
in_channel = mel_channel
for c, k, s in zip(channels, kernel_dims, strides):
self.cnns.append(nn.Conv1d(in_channel, c, k,
stride=s, padding=c//2))
self.cnns.append(nn.BatchNorm1d(c))
self.cnns.append(nn.ReLU(inplace=True))
in_channel = c
self.cnns = nn.Sequential(*self.cnns)
self.rnns = nn.ModuleList()
for _ in range(num_layers):
self.rnns.append(BNGRU(in_channel, hidden_size))
in_channel = hidden_size
self.norm = nn.BatchNorm1d(hidden_size)
self.proj = nn.Sequential(
nn.Linear(hidden_size, char_size)
)
def forward(self, x, xlen):
# T×N×C -> N×C×T
x = x.permute(1, 2, 0)
x = self.cnns(x)
for rnn in self.rnns:
x = rnn(x, xlen)
x = self.norm(x)
# N×C×T -> T×N×C
x = x.permute(2, 0, 1)
x = self.proj(x)
return F.log_softmax(x, -1)
"""
import torch
import torch.nn as nn
class BNGRU(nn.Module):
    """Batch-normalised bidirectional GRU layer working on N×C×T tensors.

    The input is batch-normalised over the channel axis, fed through a
    bidirectional GRU as a packed sequence, and the forward and backward
    halves of the GRU output are summed so that the result has
    ``hidden_size`` channels.
    """

    def __init__(self, input_size, hidden_size):
        """Build the layer.

        Args:
            input_size: Number of input channels (the C of the N×C×T input).
            hidden_size: GRU hidden size; also the number of output channels.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.bn = nn.BatchNorm1d(input_size)
        self.gru = nn.GRU(input_size, hidden_size, bidirectional=True)

    def forward(self, x, xlen):
        """Run BatchNorm followed by the bidirectional GRU.

        Args:
            x: Input tensor laid out as N×C×T.
            xlen: Valid length of each sequence in the batch (list or 1-D
                tensor). The lengths may be given in any order.

        Returns:
            Tensor laid out as N×hidden_size×T. Time steps past a
            sequence's length are zero-filled by ``pad_packed_sequence``.
        """
        maxlen = x.size(2)
        x = self.bn(x)
        # N×C×T -> T×N×C
        x = x.permute(2, 0, 1)
        # pack_padded_sequence needs the lengths on the CPU.
        if torch.is_tensor(xlen):
            xlen = xlen.cpu()
        # enforce_sorted=False lets callers pass batches that are not sorted
        # by decreasing length; the original batch order is restored when
        # the sequence is unpacked below.
        x = nn.utils.rnn.pack_padded_sequence(x, xlen, enforce_sorted=False)
        x, _ = self.gru(x)
        # total_length keeps the padded time axis equal to the input's T.
        x, _ = nn.utils.rnn.pad_packed_sequence(x, total_length=maxlen)
        # Sum the forward and backward direction outputs.
        x = x[..., :self.hidden_size] + x[..., self.hidden_size:]
        # T×N×C -> N×C×T
        x = x.permute(1, 2, 0)
        return x
class DeepSpeech(nn.Module):
    """DeepSpeech-style model: Conv1d stack -> BNGRU stack -> linear projection.

    The forward pass takes a T×N×C input, applies the convolutional
    front end, the recurrent layers and a final BatchNorm, and returns
    per-frame log-probabilities over ``char_size`` classes.
    """

    def __init__(self, mel_channel, channels, kernel_dims, strides,
                 num_layers, hidden_size, char_size):
        """Build the model.

        Args:
            mel_channel: Number of channels of the input features.
            channels: Output channel count of each Conv1d layer.
            kernel_dims: Kernel size of each Conv1d layer.
            strides: Stride of each Conv1d layer.
            num_layers: Number of BNGRU layers.
            hidden_size: Hidden size of the BNGRU layers and input size of
                the final projection.
            char_size: Number of output classes.
        """
        super().__init__()
        self.kernel_dims = kernel_dims
        self.strides = strides
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.char_size = char_size

        conv_layers = []
        in_channel = mel_channel
        for c, k, s in zip(channels, kernel_dims, strides):
            # Padding is derived from the kernel size (k // 2), not from the
            # channel count, so the time axis is only shortened by the stride.
            conv_layers.append(nn.Conv1d(in_channel, c, k,
                                         stride=s, padding=k // 2))
            conv_layers.append(nn.BatchNorm1d(c))
            conv_layers.append(nn.ReLU(inplace=True))
            in_channel = c
        self.cnns = nn.Sequential(*conv_layers)

        self.rnns = nn.ModuleList()
        for _ in range(num_layers):
            self.rnns.append(BNGRU(in_channel, hidden_size))
            in_channel = hidden_size

        self.norm = nn.BatchNorm1d(hidden_size)
        self.proj = nn.Sequential(
            nn.Linear(hidden_size, char_size)
        )

    def _conv_output_lengths(self, xlen):
        """Map input sequence lengths to lengths after the Conv1d stack."""
        lengths = xlen.tolist() if torch.is_tensor(xlen) else list(xlen)
        for layer in self.cnns:
            if not isinstance(layer, nn.Conv1d):
                continue
            k = layer.kernel_size[0]
            s = layer.stride[0]
            p = layer.padding[0]
            d = layer.dilation[0]
            # Standard Conv1d output-length formula.
            lengths = [(int(n) + 2 * p - d * (k - 1) - 1) // s + 1
                       for n in lengths]
        return lengths

    def forward(self, x, xlen):
        """Compute per-frame log-probabilities.

        Args:
            x: Input tensor laid out as T×N×C.
            xlen: Valid length of each sequence in the batch, measured in
                input frames (list or 1-D tensor).

        Returns:
            Tensor laid out as T'×N×char_size holding log-softmax scores,
            where T' is the time dimension after the convolutional stack.
        """
        # T×N×C -> N×C×T
        x = x.permute(1, 2, 0)
        x = self.cnns(x)
        # The convolutions may change the time axis, so the lengths handed
        # to the recurrent layers must be expressed in output frames.
        xlen = self._conv_output_lengths(xlen)
        for rnn in self.rnns:
            x = rnn(x, xlen)
        x = self.norm(x)
        # N×C×T -> T×N×C
        x = x.permute(2, 0, 1)
        x = self.proj(x)
        # nn.functional is reachable through the existing `nn` import; the
        # previously used alias `F` was never imported in this file.
        return nn.functional.log_softmax(x, dim=-1)