This repository has been archived by the owner on May 27, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 223
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #27 from sooftware/master
Mount TTS
- Loading branch information
Showing
25 changed files
with
3,314 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from pororo.models.tts.synthesizer import MultilingualSpeechSynthesizer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
import torch | ||
import os | ||
import glob | ||
|
||
|
||
def load_checkpoint(filepath, device): | ||
assert os.path.isfile(filepath) | ||
checkpoint_dict = torch.load(filepath, map_location=device) | ||
return checkpoint_dict | ||
|
||
|
||
def scan_checkpoint(cp_dir, prefix): | ||
pattern = os.path.join(cp_dir, prefix + '*') | ||
cp_list = glob.glob(pattern) | ||
if len(cp_list) == 0: | ||
return '' | ||
return sorted(cp_list)[-1] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,289 @@ | ||
import torch | ||
import torch.nn.functional as F | ||
import torch.nn as nn | ||
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d | ||
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm | ||
|
||
LRELU_SLOPE = 0.1 | ||
|
||
|
||
def init_weights(m, mean=0.0, std=0.01): | ||
classname = m.__class__.__name__ | ||
if classname.find("Conv") != -1: | ||
m.weight.data.normal_(mean, std) | ||
|
||
|
||
def get_padding(kernel_size, dilation=1): | ||
return int((kernel_size*dilation - dilation)/2) | ||
|
||
|
||
def feature_loss(fmap_r, fmap_g): | ||
loss = 0 | ||
for dr, dg in zip(fmap_r, fmap_g): | ||
for rl, gl in zip(dr, dg): | ||
loss += torch.mean(torch.abs(rl - gl)) | ||
return loss*2 | ||
|
||
|
||
def discriminator_loss(disc_real_outputs, disc_generated_outputs): | ||
loss = 0 | ||
r_losses = [] | ||
g_losses = [] | ||
for dr, dg in zip(disc_real_outputs, disc_generated_outputs): | ||
r_loss = torch.mean((1-dr)**2) | ||
g_loss = torch.mean(dg**2) | ||
loss += (r_loss + g_loss) | ||
r_losses.append(r_loss.item()) | ||
g_losses.append(g_loss.item()) | ||
|
||
return loss, r_losses, g_losses | ||
|
||
|
||
def generator_loss(disc_outputs): | ||
loss = 0 | ||
gen_losses = [] | ||
for dg in disc_outputs: | ||
l = torch.mean((1-dg)**2) | ||
gen_losses.append(l) | ||
loss += l | ||
|
||
return loss, gen_losses | ||
|
||
|
||
class ResBlock1(nn.Module): | ||
def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)): | ||
super(ResBlock1, self).__init__() | ||
self.h = h | ||
self.convs1 = nn.ModuleList([ | ||
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], | ||
padding=get_padding(kernel_size, dilation[0]))), | ||
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], | ||
padding=get_padding(kernel_size, dilation[1]))), | ||
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], | ||
padding=get_padding(kernel_size, dilation[2]))) | ||
]) | ||
self.convs1.apply(init_weights) | ||
|
||
self.convs2 = nn.ModuleList([ | ||
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, | ||
padding=get_padding(kernel_size, 1))), | ||
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, | ||
padding=get_padding(kernel_size, 1))), | ||
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, | ||
padding=get_padding(kernel_size, 1))) | ||
]) | ||
self.convs2.apply(init_weights) | ||
|
||
def forward(self, x): | ||
for c1, c2 in zip(self.convs1, self.convs2): | ||
xt = F.leaky_relu(x, LRELU_SLOPE) | ||
xt = c1(xt) | ||
xt = F.leaky_relu(xt, LRELU_SLOPE) | ||
xt = c2(xt) | ||
x = xt + x | ||
return x | ||
|
||
def remove_weight_norm(self): | ||
for l in self.convs1: | ||
remove_weight_norm(l) | ||
for l in self.convs2: | ||
remove_weight_norm(l) | ||
|
||
|
||
class ResBlock2(nn.Module): | ||
def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)): | ||
super(ResBlock2, self).__init__() | ||
self.h = h | ||
self.convs = nn.ModuleList([ | ||
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], | ||
padding=get_padding(kernel_size, dilation[0]))), | ||
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], | ||
padding=get_padding(kernel_size, dilation[1]))) | ||
]) | ||
self.convs.apply(init_weights) | ||
|
||
def forward(self, x): | ||
for c in self.convs: | ||
xt = F.leaky_relu(x, LRELU_SLOPE) | ||
xt = c(xt) | ||
x = xt + x | ||
return x | ||
|
||
def remove_weight_norm(self): | ||
for l in self.convs: | ||
remove_weight_norm(l) | ||
|
||
|
||
class Generator(nn.Module): | ||
def __init__(self, h): | ||
super(Generator, self).__init__() | ||
self.h = h | ||
self.num_kernels = len(h.resblock_kernel_sizes) | ||
self.num_upsamples = len(h.upsample_rates) | ||
self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3)) | ||
resblock = ResBlock1 if h.resblock == '1' else ResBlock2 | ||
|
||
self.ups = nn.ModuleList() | ||
for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): | ||
self.ups.append(weight_norm( | ||
ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)), | ||
k, u, padding=(k-u)//2))) | ||
|
||
self.resblocks = nn.ModuleList() | ||
for i in range(len(self.ups)): | ||
ch = h.upsample_initial_channel//(2**(i+1)) | ||
for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): | ||
self.resblocks.append(resblock(h, ch, k, d)) | ||
|
||
self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3)) | ||
self.ups.apply(init_weights) | ||
self.conv_post.apply(init_weights) | ||
|
||
def forward(self, x): | ||
x = self.conv_pre(x) | ||
for i in range(self.num_upsamples): | ||
x = F.leaky_relu(x, LRELU_SLOPE) | ||
x = self.ups[i](x) | ||
xs = None | ||
for j in range(self.num_kernels): | ||
if xs is None: | ||
xs = self.resblocks[i*self.num_kernels+j](x) | ||
else: | ||
xs += self.resblocks[i*self.num_kernels+j](x) | ||
x = xs / self.num_kernels | ||
x = F.leaky_relu(x) | ||
x = self.conv_post(x) | ||
x = torch.tanh(x) | ||
|
||
return x | ||
|
||
def remove_weight_norm(self): | ||
for l in self.ups: | ||
remove_weight_norm(l) | ||
for l in self.resblocks: | ||
l.remove_weight_norm() | ||
remove_weight_norm(self.conv_pre) | ||
remove_weight_norm(self.conv_post) | ||
|
||
|
||
class DiscriminatorP(torch.nn.Module): | ||
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): | ||
super(DiscriminatorP, self).__init__() | ||
self.period = period | ||
norm_f = weight_norm if use_spectral_norm == False else spectral_norm | ||
self.convs = nn.ModuleList([ | ||
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), | ||
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), | ||
norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), | ||
norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), | ||
norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), | ||
]) | ||
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) | ||
|
||
def forward(self, x): | ||
fmap = [] | ||
|
||
# 1d to 2d | ||
b, c, t = x.shape | ||
if t % self.period != 0: # pad first | ||
n_pad = self.period - (t % self.period) | ||
x = F.pad(x, (0, n_pad), "reflect") | ||
t = t + n_pad | ||
x = x.view(b, c, t // self.period, self.period) | ||
|
||
for l in self.convs: | ||
x = l(x) | ||
x = F.leaky_relu(x, LRELU_SLOPE) | ||
fmap.append(x) | ||
x = self.conv_post(x) | ||
fmap.append(x) | ||
x = torch.flatten(x, 1, -1) | ||
|
||
return x, fmap | ||
|
||
|
||
class MultiPeriodDiscriminator(torch.nn.Module): | ||
def __init__(self): | ||
super(MultiPeriodDiscriminator, self).__init__() | ||
self.discriminators = nn.ModuleList([ | ||
DiscriminatorP(2), | ||
DiscriminatorP(3), | ||
DiscriminatorP(5), | ||
DiscriminatorP(7), | ||
DiscriminatorP(11), | ||
]) | ||
|
||
def forward(self, y, y_hat): | ||
y_d_rs = [] | ||
y_d_gs = [] | ||
fmap_rs = [] | ||
fmap_gs = [] | ||
for i, d in enumerate(self.discriminators): | ||
y_d_r, fmap_r = d(y) | ||
y_d_g, fmap_g = d(y_hat) | ||
y_d_rs.append(y_d_r) | ||
fmap_rs.append(fmap_r) | ||
y_d_gs.append(y_d_g) | ||
fmap_gs.append(fmap_g) | ||
|
||
return y_d_rs, y_d_gs, fmap_rs, fmap_gs | ||
|
||
|
||
class DiscriminatorS(torch.nn.Module): | ||
def __init__(self, use_spectral_norm=False): | ||
super(DiscriminatorS, self).__init__() | ||
norm_f = weight_norm if use_spectral_norm == False else spectral_norm | ||
self.convs = nn.ModuleList([ | ||
norm_f(Conv1d(1, 128, 15, 1, padding=7)), | ||
norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)), | ||
norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)), | ||
norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)), | ||
norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)), | ||
norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)), | ||
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), | ||
]) | ||
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) | ||
|
||
def forward(self, x): | ||
fmap = [] | ||
for l in self.convs: | ||
x = l(x) | ||
x = F.leaky_relu(x, LRELU_SLOPE) | ||
fmap.append(x) | ||
x = self.conv_post(x) | ||
fmap.append(x) | ||
x = torch.flatten(x, 1, -1) | ||
|
||
return x, fmap | ||
|
||
|
||
class MultiScaleDiscriminator(torch.nn.Module): | ||
def __init__(self): | ||
super(MultiScaleDiscriminator, self).__init__() | ||
self.discriminators = nn.ModuleList([ | ||
DiscriminatorS(use_spectral_norm=True), | ||
DiscriminatorS(), | ||
DiscriminatorS(), | ||
]) | ||
self.meanpools = nn.ModuleList([ | ||
AvgPool1d(4, 2, padding=2), | ||
AvgPool1d(4, 2, padding=2) | ||
]) | ||
|
||
def forward(self, y, y_hat): | ||
y_d_rs = [] | ||
y_d_gs = [] | ||
fmap_rs = [] | ||
fmap_gs = [] | ||
for i, d in enumerate(self.discriminators): | ||
if i != 0: | ||
y = self.meanpools[i-1](y) | ||
y_hat = self.meanpools[i-1](y_hat) | ||
y_d_r, fmap_r = d(y) | ||
y_d_g, fmap_g = d(y_hat) | ||
y_d_rs.append(y_d_r) | ||
fmap_rs.append(fmap_r) | ||
y_d_gs.append(y_d_g) | ||
fmap_gs.append(fmap_g) | ||
|
||
return y_d_rs, y_d_gs, fmap_rs, fmap_gs |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
import torch | ||
|
||
from pororo.models.tts.tacotron.params import Params as hp | ||
from pororo.models.tts.utils import audio, text | ||
|
||
|
||
def synthesize(model, input_data, force_cpu=False, device=None): | ||
item = input_data.split("|") | ||
clean_text = item[1] | ||
|
||
if not hp.use_punctuation: | ||
clean_text = text.remove_punctuation(clean_text) | ||
if not hp.case_sensitive: | ||
clean_text = text.to_lower(clean_text) | ||
if hp.remove_multiple_wspaces: | ||
clean_text = text.remove_odd_whitespaces(clean_text) | ||
|
||
t = torch.LongTensor( | ||
text.to_sequence(clean_text, use_phonemes=hp.use_phonemes)) | ||
|
||
if hp.multi_language: | ||
l_tokens = item[3].split(",") | ||
t_length = len(clean_text) + 1 | ||
l = [] | ||
for token in l_tokens: | ||
l_d = token.split("-") | ||
|
||
language = [0] * hp.language_number | ||
for l_cw in l_d[0].split(":"): | ||
l_cw_s = l_cw.split("*") | ||
language[hp.languages.index( | ||
l_cw_s[0])] = (1 if len(l_cw_s) == 1 else float(l_cw_s[1])) | ||
|
||
language_length = int(l_d[1]) if len(l_d) == 2 else t_length | ||
l += [language] * language_length | ||
t_length -= language_length | ||
l = torch.FloatTensor([l]) | ||
else: | ||
l = None | ||
|
||
s = (torch.LongTensor([hp.unique_speakers.index(item[2])]) if hp.multi_speaker else None) | ||
|
||
if torch.cuda.is_available() and not force_cpu: | ||
t = t.to(device) | ||
if l is not None: | ||
l = l.to(device) | ||
if s is not None: | ||
s = s.to(device) | ||
|
||
s = model.inference(t, speaker=s, language=l).cpu().detach().numpy() | ||
s = audio.denormalize_spectrogram(s, not hp.predict_linear) | ||
|
||
return s |
Oops, something went wrong.