# microsoft_tts.py
import asyncio
import re
import uuid
from datetime import datetime, timezone

import websockets
class MSTTS:
    """Text-to-speech client for the Microsoft Edge "read aloud" websocket endpoint.

    The public entry point is :meth:`text_to_speech`, which synthesizes a
    piece of text with a given voice and writes the received audio bytes
    (requested as ``audio-24khz-48kbitrate-mono-mp3``) to a file.
    """

    # Token source: https://github.com/rany2/edge-tts/blob/master/src/edge_tts/constants.py
    # List of supported voices: https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4
    _TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4"
    _WSS_URL = (
        "wss://speech.platform.bing.com/consumer/speech/synthesize/"
        "readaloud/edge/v1?TrustedClientToken=" + _TRUSTED_CLIENT_TOKEN
    )

    # Handshake headers sent when opening the websocket.
    _HEADERS = {
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
        "Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        " (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
    }

    # Everything after this marker in a binary frame is audio payload.
    _AUDIO_MARKER = b"Path:audio\r\n"
    # A text frame containing this marker ends the synthesis turn.
    _TURN_END_MARKER = "Path:turn.end"

    def __init__(self):
        pass

    def _hr_cr(self, hr):
        """Return ``hr`` shifted back by one hour (wrapping at 24) as a string.

        Legacy helper kept for backward compatibility; ``_getXTime`` no
        longer uses it because it now reads the UTC clock directly.
        """
        corrected = (hr - 1) % 24
        return str(corrected)

    def _fr(self, input_string):
        """Left-pad ``input_string`` with zeros to at least two characters.

        Example: ``"5"`` -> ``"05"``; strings of length >= 2 are unchanged.
        """
        return input_string.rjust(2, "0")

    def _getXTime(self, now=None):
        """Return an ``X-Timestamp`` value: ``YYYY-MM-DDTHH:MM:SS.mmmZ``.

        Args:
            now: Optional ``datetime`` to format. When omitted, the current
                UTC time is used. A timezone-aware value is converted to
                UTC; a naive value is formatted as given.

        Returns:
            The timestamp with exactly three millisecond digits and a
            trailing ``Z``.
        """
        if now is None:
            # The "Z" suffix denotes UTC, so read the UTC clock rather than
            # shifting local time by a fixed hour (which never adjusted the date).
            now = datetime.now(timezone.utc)
        elif now.tzinfo is not None:
            now = now.astimezone(timezone.utc)
        # Integer-divide so that e.g. 5000 microseconds becomes "005", not "500".
        millis = now.microsecond // 1000
        return now.strftime("%Y-%m-%dT%H:%M:%S") + f".{millis:03d}Z"

    def _speech_config_message(self):
        """Build the ``speech.config`` message that selects the audio format."""
        return (
            f"X-Timestamp:{self._getXTime()}\r\n"
            "Content-Type:application/json; charset=utf-8\r\n"
            "Path:speech.config\r\n\r\n"
            '{"context":{"synthesis":{"audio":{"metadataoptions":{'
            '"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":true},'
            '"outputFormat":"audio-24khz-48kbitrate-mono-mp3"'
            "}}}}\r\n"
        )

    def _ssml_message(self, req_id, text, voice_name):
        """Build the ``ssml`` message carrying the text to synthesize."""
        pitch = 0
        rate = 0
        # NOTE(review): ``text`` is inserted verbatim, so characters such as
        # "&" or "<" end up unescaped in the SSML document. Left unchanged
        # because callers may be passing SSML markup on purpose -- confirm.
        ssml_text = f'<speak version="1.0" xml:lang="en-US" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:emo="http://www.w3.org/2009/10/emotionml" xmlns:mstts="http://www.w3.org/2001/mstts"> <voice name="{voice_name}"> <prosody pitch="{pitch}%" rate="{rate}%">{text}</prosody> </voice> </speak>'
        return (
            f"X-RequestId:{req_id}\r\n"
            "Content-Type:application/ssml+xml\r\n"
            f"X-Timestamp:{self._getXTime()}Z\r\n"  # This is not a mistake, Microsoft Edge bug.
            "Path:ssml\r\n\r\n"
            f"{ssml_text}"
        )

    @classmethod
    def _extract_audio(cls, frame):
        """Return the audio payload of a binary frame, or ``b""`` if it has none.

        The payload is everything following the first ``Path:audio\\r\\n``
        marker. Frames without the marker contribute nothing instead of
        leaking header bytes into the output.
        """
        marker_at = frame.find(cls._AUDIO_MARKER)
        if marker_at == -1:
            return b""
        return frame[marker_at + len(cls._AUDIO_MARKER):]

    async def transferMsTTSData(self, text, outputPath, voice_name='zh-CN-XiaoxiaoNeural'):
        """Synthesize ``text`` over the websocket and write the audio to ``outputPath``.

        Args:
            text: Text placed inside the SSML ``<prosody>`` element.
            outputPath: Destination file; it is opened in binary write mode
                and overwritten.
            voice_name: Name of the voice to request.
        """
        req_id = uuid.uuid4().hex.upper()
        endpoint = f"{self._WSS_URL}&ConnectionId={req_id}"
        # bytearray gives amortized O(1) appends while frames stream in.
        audio_stream = bytearray()
        # NOTE(review): newer ``websockets`` releases may expect this keyword
        # as ``additional_headers`` -- confirm against the installed version.
        async with websockets.connect(
            endpoint,
            extra_headers=dict(self._HEADERS),
        ) as websocket:
            await websocket.send(self._speech_config_message())
            await websocket.send(self._ssml_message(req_id, text, voice_name))
            while True:
                response = await websocket.recv()
                print("\rreceiving...", end="")
                if isinstance(response, bytes):
                    # Binary frames carry audio after a short textual header.
                    audio_stream += self._extract_audio(response)
                elif self._TURN_END_MARKER in response:
                    # Text frame announcing the end of the turn: stop reading.
                    break
            with open(outputPath, "wb") as audio_out:
                audio_out.write(audio_stream)

    async def mainSeq(self, SSML_text, outputPath, voice_name='zh-CN-XiaoxiaoNeural'):
        """Coroutine wrapper that forwards to :meth:`transferMsTTSData`."""
        await self.transferMsTTSData(SSML_text, outputPath, voice_name=voice_name)

    def text_to_speech(self, text, tgt_fp, voice_name='zh-CN-XiaoxiaoNeural'):
        """Synchronously synthesize ``text`` into the file ``tgt_fp``.

        Runs the async pipeline with ``asyncio.run`` and prints
        ``completed!`` once the file has been written.
        """
        asyncio.run(self.mainSeq(text, tgt_fp, voice_name=voice_name))
        print("completed!")
if __name__ == '__main__':
    # Demo run: synthesize a short Mandarin test sentence with the default voice.
    ttsmodel = MSTTS()
    text = '这是一个测试句子'
    # The service is asked for MP3 audio, so name the output file accordingly.
    tgt_fp = "./output.mp3"
    ttsmodel.text_to_speech(text, tgt_fp)