add paddlespeech plugin (#15)
* add paddlespeech plugin

* 0.0.16
iftaken authored Jul 8, 2022
1 parent c435a03 commit e8bb63a
Showing 6 changed files with 143 additions and 2 deletions.
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
0.0.15
0.0.16
14 changes: 14 additions & 0 deletions examples/paddlespeech-bot.py
@@ -0,0 +1,14 @@
"""paddlespeech bot examples"""
import asyncio
from wechaty import Wechaty
from wechaty_plugin_contrib.contrib.paddlespeech_plugin import (
PaddleSpeechPlugin
)

async def run() -> None:
"""async run method"""
plugin = PaddleSpeechPlugin()
bot = Wechaty().use(plugin)
await bot.start()

asyncio.run(run())
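
A minimal sketch of running this example against a Wechaty puppet service, assuming the usual python-wechaty token environment variable; the token value below is a placeholder, not part of this commit:

import asyncio
import os

from wechaty import Wechaty
from wechaty_plugin_contrib.contrib.paddlespeech_plugin import PaddleSpeechPlugin

# placeholder token; python-wechaty typically reads it from this environment variable
os.environ.setdefault('WECHATY_PUPPET_SERVICE_TOKEN', 'your-puppet-service-token')

async def main() -> None:
    bot = Wechaty().use(PaddleSpeechPlugin())
    await bot.start()

if __name__ == '__main__':
    asyncio.run(main())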
2 changes: 1 addition & 1 deletion src/version.py
@@ -1 +1 @@
VERSION = '0.0.15'
VERSION = '0.0.16'
7 changes: 7 additions & 0 deletions src/wechaty_plugin_contrib/contrib/paddlespeech_plugin/__init__.py
@@ -0,0 +1,7 @@
from .plugin import (
PaddleSpeechPlugin
)

__all__ = [
'PaddleSpeechPlugin',
]
80 changes: 80 additions & 0 deletions src/wechaty_plugin_contrib/contrib/paddlespeech_plugin/plugin.py
@@ -0,0 +1,80 @@
import os
from typing import Optional, Tuple

from wechaty import FileBox, Message, MessageType, WechatyPluginOptions, WechatyPlugin
from wechaty_puppet import get_logger

from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.tts.infer import TTSExecutor

from .util import wav2silk, silk2wav, get_timestmp_string


class PaddleSpeechPlugin(WechatyPlugin):
    """PaddleSpeech plugin: ASR for incoming voice messages, TTS for incoming text."""

    def __init__(self, options: Optional[WechatyPluginOptions] = None):
        super().__init__(options)
        self.cache_dir = f'.wechaty/{self.name}'
        os.makedirs(self.cache_dir, exist_ok=True)
        self.logger = get_logger(self.name, f'{self.cache_dir}/log.log')
        self.asr_model = ASRExecutor()
        self.tts_model = TTSExecutor()
        # warm-up text; the first run downloads the ASR & TTS models
        self.warm_text = "初始化飞桨语音识别与合成"  # "initialize PaddlePaddle speech recognition and synthesis"
        self.warm_path = os.path.join(self.cache_dir, "warm_up.wav")
        self.warm_up()

    def warm_up(self) -> None:
        """Run one synthesis/recognition pass so the models are downloaded and loaded (Chinese only)."""
        self.tts_model(text=self.warm_text, output=self.warm_path)
        self.asr_model(self.warm_path, force_yes=True)

    def asr(self, talker: str, input_silk: str) -> str:
        """Convert a SILK voice message to WAV and run speech recognition on it."""
        timestmp = get_timestmp_string()
        outwav = os.path.join(self.cache_dir, f"asr_{talker}{timestmp}.wav")
        outpcm = os.path.join(self.cache_dir, f"pcm_{talker}{timestmp}.pcm")
        trans_result = silk2wav(input_silk, outwav, sr=16000, out_pcm=outpcm)
        if trans_result:
            out_wav, _ = trans_result
            asr_result = self.asr_model(out_wav, force_yes=True)
        else:
            asr_result = "语音识别错误,音频转换失败"  # recognition error: audio conversion failed
        return asr_result

    def tts(self, talker: str, text: str) -> Tuple[Optional[str], int]:
        """Synthesize text to WAV, then convert it to SILK; return (silk_path, duration_in_seconds)."""
        timestmp = get_timestmp_string()
        outwav = os.path.join(self.cache_dir, f"tts_{talker}{timestmp}.wav")
        self.tts_model(text=text, output=outwav)
        if os.path.exists(outwav):
            outsilk, _, duration = wav2silk(media_path=outwav, out_path=self.cache_dir)
        else:
            outsilk = None
            duration = 0
        return outsilk, duration

    async def on_message(self, msg: Message) -> None:
        """Listen for message events: voice -> text (ASR), text -> voice (TTS)."""
        if msg.room():
            return
        talker = msg.talker().name

        if msg.type() == MessageType.MESSAGE_TYPE_AUDIO:
            # ASR: save the SILK voice file, transcribe it, reply with the text
            file_box = await msg.to_file_box()
            saved_file = os.path.join(self.cache_dir, f"silk_{talker}_{get_timestmp_string()}.silk")
            await file_box.to_file(saved_file)
            asr_result = self.asr(talker, saved_file)
            await msg.talker().say(asr_result)
        elif msg.type() == MessageType.MESSAGE_TYPE_TEXT:
            # TTS: synthesize the text and reply with a voice message
            text = msg.text()
            outsilk, duration = self.tts(talker, text)
            if outsilk:
                new_audio_file = FileBox.from_file(outsilk)
                # voiceLength is expected in milliseconds
                new_audio_file.metadata = {"voiceLength": duration * 1000}
                await msg.talker().say(new_audio_file)
            else:
                await msg.talker().say("语音合成失败")  # speech synthesis failed
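
As a rough standalone check of the speech stack this plugin wraps, the two PaddleSpeech executors can be exercised directly, mirroring what warm_up() does; the file name here is arbitrary and the Chinese models are downloaded on first use:

from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.tts.infer import TTSExecutor

tts = TTSExecutor()
asr = ASRExecutor()

# synthesize a short Chinese sentence to a WAV file ...
tts(text="初始化飞桨语音识别与合成", output="warm_up.wav")
# ... then transcribe it back; force_yes skips the interactive confirmation prompt
print(asr("warm_up.wav", force_yes=True))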

40 changes: 40 additions & 0 deletions src/wechaty_plugin_contrib/contrib/paddlespeech_plugin/util.py
@@ -0,0 +1,40 @@
import datetime
import os
from typing import Optional, Tuple

import pilk
from pydub import AudioSegment


def get_timestmp_string() -> str:
    """Return the current time as a YYYYMMDDHHMMSS string, used in cache file names."""
    return datetime.datetime.now().strftime("%Y%m%d%H%M%S")


def wav2silk(media_path: str, out_path: str = './') -> Tuple[str, str, int]:
    """Convert an audio file to mono 16-bit PCM, encode it to SILK, and
    return (silk_path, pcm_path, duration_in_seconds)."""
    media = AudioSegment.from_file(media_path)
    base_name = os.path.splitext(os.path.basename(media_path))[0]
    pcm_path = os.path.join(out_path, base_name + '.pcm')
    silk_path = os.path.join(out_path, base_name + '.silk')

    media.export(pcm_path, 's16le', parameters=['-ar', str(media.frame_rate), '-ac', '1']).close()
    duration = pilk.encode(pcm_path, silk_path, pcm_rate=media.frame_rate, tencent=True)
    return silk_path, pcm_path, duration


def silk2pcm(input_silk: str, out_pcm: str, pcm_rate: int = 24000) -> Tuple[int, str]:
    """Decode a SILK file to raw PCM; return (duration_in_seconds, pcm_path)."""
    duration = pilk.decode(input_silk, out_pcm, pcm_rate=pcm_rate)
    return duration, out_pcm


def pcm2wav(input_pcm: str, out_wav: str, sr: int) -> bool:
    """Wrap raw 16-bit mono PCM into a WAV container with ffmpeg; return True on success."""
    cmd = f"ffmpeg -y -f s16le -ar {sr} -ac 1 -i {input_pcm} {out_wav}"
    return os.system(cmd) == 0


def silk2wav(input_silk: str, out_wav: str, sr: int, out_pcm: str = "temp.pcm") -> Optional[Tuple[str, int]]:
    """Decode a SILK file to WAV; return (wav_path, duration_in_seconds) or None on failure."""
    duration = pilk.decode(input_silk, pcm=out_pcm, pcm_rate=sr)
    if pcm2wav(out_pcm, out_wav, sr):
        return out_wav, duration
    return None
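
Both conversion paths rely on ffmpeg being on the PATH (pcm2wav shells out to it directly, and pydub's export uses it as well). A rough round-trip sketch of these helpers, with input.wav as a hypothetical source file:

from wechaty_plugin_contrib.contrib.paddlespeech_plugin.util import wav2silk, silk2wav

# WAV -> SILK: returns the SILK path, the intermediate PCM path, and the duration in seconds
silk_path, pcm_path, duration = wav2silk('input.wav')

# SILK -> WAV: decode at 16 kHz, as the plugin does before running ASR
result = silk2wav(silk_path, 'roundtrip.wav', sr=16000, out_pcm='roundtrip.pcm')
if result:
    wav_path, seconds = result
    print(f'decoded {wav_path}: {seconds}s of audio')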
