Skip to content

Commit

Permalink
more comment on tts frontend
Browse files Browse the repository at this point in the history
  • Loading branch information
zh794390558 committed Jun 7, 2023
1 parent 8aa9790 commit 42f2186
Show file tree
Hide file tree
Showing 9 changed files with 409 additions and 200 deletions.
Empty file.
33 changes: 20 additions & 13 deletions paddlespeech/t2s/exps/syn_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,14 +99,23 @@ def norm(data, mean, std):
return (data - mean) / std


def get_chunks(mel, chunk_size: int, pad_size: int):
    """Split a mel spectrogram along the time axis into overlapping chunks.

    Each chunk spans ``chunk_size`` frames plus up to ``pad_size`` frames of
    left and right context; context is clipped at the sequence boundaries, so
    the first/last chunks (and a short final remainder) may be smaller.

    Args:
        mel (paddle.Tensor): mel spectrogram, shape (B, T, D).
        chunk_size (int): number of frames per chunk, excluding context.
        pad_size (int): frames of left and right context added to each chunk.

    Returns:
        List: chunk tensors, each of shape (B, chunk_size + context, D).
    """
    T = mel.shape[1]
    # number of chunks needed to cover all T frames
    n = math.ceil(T / chunk_size)

    chunks = []
    for i in range(n):
        # clip the padded window to the valid frame range [0, T)
        start = max(0, i * chunk_size - pad_size)
        end = min((i + 1) * chunk_size + pad_size, T)
        chunks.append(mel[:, start:end, :])
    return chunks


Expand All @@ -117,14 +126,10 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
with open(text_file, 'rt', encoding='utf-8') as f:
for line in f:
if line.strip() != "":
items = re.split(r"\s+", line.strip(), 1)
items = re.split(r"\s+", line.strip(), maxsplit=1)
assert len(items) == 2
utt_id = items[0]
if lang in {'zh', 'canton'}:
sentence = "".join(items[1:])
elif lang == 'en':
sentence = " ".join(items[1:])
elif lang == 'mix':
sentence = " ".join(items[1:])
sentence = items[1]
sentences.append((utt_id, sentence))
return sentences

Expand Down Expand Up @@ -319,6 +324,7 @@ def run_frontend(
input_ids = {}
if text.strip() != "" and re.match(r".*?<speak>.*?</speak>.*", text,
re.DOTALL):
# using ssml
input_ids = frontend.get_input_ids_ssml(
text,
merge_sentences=merge_sentences,
Expand Down Expand Up @@ -359,6 +365,7 @@ def run_frontend(
outs.update({'is_slurs': is_slurs})
else:
print("lang should in {'zh', 'en', 'mix', 'canton', 'sing'}!")

outs.update({'phone_ids': phone_ids})
return outs

Expand Down
26 changes: 26 additions & 0 deletions paddlespeech/t2s/exps/synthesize_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.
import argparse
from pathlib import Path
from pprint import pprint

import paddle
import soundfile as sf
Expand Down Expand Up @@ -78,6 +79,7 @@ def evaluate(args):

# whether dygraph to static
if args.inference_dir:
print("convert am and voc to static model.")
# acoustic model
am_inference = am_to_static(
am_inference=am_inference,
Expand All @@ -92,6 +94,7 @@ def evaluate(args):

output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)

merge_sentences = False
# Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph
# but still not stopping in the end (NOTE by yuantian01 Feb 9 2022)
Expand All @@ -102,12 +105,18 @@ def evaluate(args):
if am_name == 'speedyspeech':
get_tone_ids = True

# wav samples
N = 0
# inference time cost
T = 0

# [(uid, text), ]
if am_name == 'diffsinger':
sentences = get_sentences_svs(text_file=args.text)
else:
sentences = get_sentences(text_file=args.text, lang=args.lang)
pprint(f"inputs: {sentences}")

for utt_id, sentence in sentences:
with timer() as t:
if am_name == "diffsinger":
Expand All @@ -116,6 +125,8 @@ def evaluate(args):
else:
text = sentence
svs_input = None

# frontend
frontend_dict = run_frontend(
frontend=frontend,
text=text,
Expand All @@ -124,25 +135,33 @@ def evaluate(args):
lang=args.lang,
svs_input=svs_input)
phone_ids = frontend_dict['phone_ids']
# pprint(f"process: {utt_id} {phone_ids}")

with paddle.no_grad():
flags = 0
for i in range(len(phone_ids)):
# sub phone, split by `sp` or punctuation.
part_phone_ids = phone_ids[i]

# acoustic model
if am_name == 'fastspeech2':
# multi speaker
if am_dataset in {"aishell3", "vctk", "mix", "canton"}:
# multi-speaker
spk_id = paddle.to_tensor(args.spk_id)
mel = am_inference(part_phone_ids, spk_id)
else:
# single-speaker
mel = am_inference(part_phone_ids)
elif am_name == 'speedyspeech':
part_tone_ids = frontend_dict['tone_ids'][i]
if am_dataset in {"aishell3", "vctk", "mix"}:
# multi-speaker
spk_id = paddle.to_tensor(args.spk_id)
mel = am_inference(part_phone_ids, part_tone_ids,
spk_id)
else:
# single-speaker
mel = am_inference(part_phone_ids, part_tone_ids)
elif am_name == 'tacotron2':
mel = am_inference(part_phone_ids)
Expand All @@ -155,24 +174,31 @@ def evaluate(args):
note=part_note_ids,
note_dur=part_note_durs,
is_slur=part_is_slurs, )

# vocoder
wav = voc_inference(mel)
if flags == 0:
wav_all = wav
flags = 1
else:
wav_all = paddle.concat([wav_all, wav])

wav = wav_all.numpy()
N += wav.size
T += t.elapse

# samples per second
speed = wav.size / t.elapse
# generate one second wav need `RTF` seconds
rtf = am_config.fs / speed
print(
f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
)

sf.write(
str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
print(f"{utt_id} done!")

print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")


Expand Down
120 changes: 75 additions & 45 deletions paddlespeech/t2s/frontend/arpabet.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,63 +11,75 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlespeech.t2s.frontend.phonectic import Phonetics
"""
A phonology system with ARPABET symbols and limited punctuations. The G2P
conversion is done by g2p_en.
Note that g2p_en does not handle words with hyphens well. So make sure the input
sentence is first normalized.
"""
from paddlespeech.t2s.frontend.vocab import Vocab
from g2p_en import G2p

from paddlespeech.t2s.frontend.phonectic import Phonetics
from paddlespeech.t2s.frontend.vocab import Vocab


class ARPABET(Phonetics):
"""A phonology for English that uses ARPABET as the phoneme vocabulary.
"""A phonology for English that uses ARPABET without stress as the phoneme vocabulary.
47 symbols = 39 phones + 4 punctuations + 4 special tokens(<pad> <unk> <s> </s>)
The current phoneme set contains 39 phonemes, vowels carry a lexical stress marker:
0 — No stress
1 — Primary stress
2 — Secondary stress
Phoneme Set:
Phoneme Example Translation
------- ------- -----------
AA odd AA D
AE at AE T
AH hut HH AH T
AO ought AO T
AW cow K AW
AY hide HH AY D
B be B IY
CH cheese CH IY Z
D dee D IY
DH thee DH IY
EH Ed EH D
ER hurt HH ER T
EY ate EY T
F fee F IY
G green G R IY N
HH he HH IY
IH it IH T
IY eat IY T
JH gee JH IY
K key K IY
L lee L IY
M me M IY
N knee N IY
NG ping P IH NG
OW oat OW T
OY toy T OY
P pee P IY
R read R IY D
S sea S IY
SH she SH IY
T tea T IY
TH theta TH EY T AH
UH hood HH UH D
UW two T UW
V vee V IY
W we W IY
Y yield Y IY L D
Z zee Z IY
ZH seizure S IY ZH ER
See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details.
Phoneme Example Translation
------- ------- -----------
AA odd AA D
AE at AE T
AH hut HH AH T
AO ought AO T
AW cow K AW
AY hide HH AY D
B be B IY
CH cheese CH IY Z
D dee D IY
DH thee DH IY
EH Ed EH D
ER hurt HH ER T
EY ate EY T
F fee F IY
G green G R IY N
HH he HH IY
IH it IH T
IY eat IY T
JH gee JH IY
K key K IY
L lee L IY
M me M IY
N knee N IY
NG ping P IH NG
OW oat OW T
OY toy T OY
P pee P IY
R read R IY D
S sea S IY
SH she SH IY
T tea T IY
TH theta TH EY T AH
UH hood HH UH D
UW two T UW
V vee V IY
W we W IY
Y yield Y IY L D
Z zee Z IY
ZH seizure S IY ZH ER
"""
# 39 phonemes
phonemes = [
'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER',
'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW',
Expand All @@ -76,6 +88,8 @@ class ARPABET(Phonetics):
]
punctuations = [',', '.', '?', '!']
symbols = phonemes + punctuations
# vowels carry a lexical stress marker:
# 0 unstressed(无重音), 1 primary stress(主重音)和 2 secondary stress(次重音)
_stress_to_no_stress_ = {
'AA0': 'AA',
'AA1': 'AA',
Expand Down Expand Up @@ -124,7 +138,12 @@ class ARPABET(Phonetics):
'UW2': 'UW'
}

def __repr__(self):
fmt = "ARPABETWithoutStress(phonemes: {}, punctuations: {})"
return fmt.format(len(phonemes), punctuations)

def __init__(self):
# https://github.com/Kyubyong/g2p/blob/master/g2p_en/g2p.py
self.backend = G2p()
self.vocab = Vocab(self.phonemes + self.punctuations)

Expand All @@ -139,6 +158,7 @@ def phoneticize(self, sentence, add_start_end=False):
Returns:
List[str]: The list of pronunciation sequence.
"""
# g2p and remove vowel stress
phonemes = [
self._remove_vowels(item) for item in self.backend(sentence)
]
Expand All @@ -158,6 +178,7 @@ def numericalize(self, phonemes):
Returns:
List[int]: The list of pronunciation id sequence.
"""
# phonemes to ids
ids = [self.vocab.lookup(item) for item in phonemes]
return ids

Expand Down Expand Up @@ -189,11 +210,16 @@ def __call__(self, sentence, add_start_end=False):
def vocab_size(self):
""" Vocab size.
"""
# 47 = 39 phones + 4 punctuations + 4 special tokens
# 47 = 39 phones + 4 punctuations + 4 special tokens(<pad> <unk> <s> </s>)
return len(self.vocab)


class ARPABETWithStress(Phonetics):
"""
A phonology for English that uses ARPABET with stress as the phoneme vocabulary.
77 symbols = 69 phones + 4 punctuations + 4 special tokens
"""
phonemes = [
'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D',
Expand All @@ -206,6 +232,10 @@ class ARPABETWithStress(Phonetics):
punctuations = [',', '.', '?', '!']
symbols = phonemes + punctuations

def __repr__(self):
fmt = "ARPABETWithStress(phonemes: {}, punctuations: {})"
return fmt.format(len(phonemes), punctuations)

    def __init__(self):
        # g2p_en grapheme-to-phoneme converter used as the G2P backend
        self.backend = G2p()
        # id lookup over stressed phonemes + punctuations; per the class
        # docstring the 4 special tokens presumably come from Vocab — confirm
        self.vocab = Vocab(self.phonemes + self.punctuations)
Expand Down
3 changes: 2 additions & 1 deletion paddlespeech/t2s/frontend/polyphonic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,5 @@ polyphonic:
恶行: ['e4','xing2']
: ['ai4']
扎实: ['zha1','shi2']
干将: ['gan4','jiang4']
干将: ['gan4','jiang4']
陈威行: ['chen2', 'wei1', 'hang2']
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,14 @@ def get_dom_split(self, mixstr):
dom = DomXml(in_xml)
tags = dom.get_text_and_sayas_tags()
ctlist.extend(tags)

ctlist.append(after_xml)
return ctlist
else:
ctlist.append(mixstr)
return ctlist


class DomXml():
def __init__(self, xmlstr):
self.tdom = parseString(xmlstr) #Document
Expand Down
Loading

0 comments on commit 42f2186

Please sign in to comment.