add paddlespeech plugin (#15)
* add paddlespeech plugin

* 0.0.16
iftaken authored Jul 8, 2022
1 parent c435a03 commit e8bb63a
Showing 6 changed files with 143 additions and 2 deletions.
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
0.0.15
0.0.16
14 changes: 14 additions & 0 deletions examples/paddlespeech-bot.py
@@ -0,0 +1,14 @@
"""paddlespeech bot examples"""
import asyncio
from wechaty import Wechaty
from wechaty_plugin_contrib.contrib.paddlespeech_plugin import (
PaddleSpeechPlugin
)

async def run() -> None:
"""async run method"""
plugin = PaddleSpeechPlugin()
bot = Wechaty().use(plugin)
await bot.start()

asyncio.run(run())
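
A minimal sketch of running this example against a Wechaty puppet service, assuming the usual python-wechaty token environment variable; the token value below is a placeholder, not part of this commit:

import asyncio
import os

from wechaty import Wechaty
from wechaty_plugin_contrib.contrib.paddlespeech_plugin import PaddleSpeechPlugin

# placeholder token; python-wechaty typically reads it from this environment variable
os.environ.setdefault('WECHATY_PUPPET_SERVICE_TOKEN', 'your-puppet-service-token')

async def main() -> None:
    bot = Wechaty().use(PaddleSpeechPlugin())
    await bot.start()

if __name__ == '__main__':
    asyncio.run(main())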
2 changes: 1 addition & 1 deletion src/version.py
@@ -1 +1 @@
VERSION = '0.0.15'
VERSION = '0.0.16'
7 changes: 7 additions & 0 deletions src/wechaty_plugin_contrib/contrib/paddlespeech_plugin/__init__.py
@@ -0,0 +1,7 @@
from .plugin import (
PaddleSpeechPlugin
)

__all__ = [
'PaddleSpeechPlugin',
]
80 changes: 80 additions & 0 deletions src/wechaty_plugin_contrib/contrib/paddlespeech_plugin/plugin.py
@@ -0,0 +1,80 @@
import os
from typing import Optional, Tuple

from wechaty import FileBox, Message, MessageType, WechatyPluginOptions, WechatyPlugin
from wechaty_puppet import get_logger

from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.tts.infer import TTSExecutor

from .util import wav2silk, silk2wav, get_timestmp_string


class PaddleSpeechPlugin(WechatyPlugin):
    """PaddleSpeech plugin: ASR for incoming voice messages, TTS for incoming text."""

    def __init__(self, options: Optional[WechatyPluginOptions] = None):
        super().__init__(options)
        self.cache_dir = f'.wechaty/{self.name}'
        os.makedirs(self.cache_dir, exist_ok=True)
        self.logger = get_logger(self.name, f'{self.cache_dir}/log.log')
        self.asr_model = ASRExecutor()
        self.tts_model = TTSExecutor()
        # warm-up text; the first run downloads the ASR & TTS models
        self.warm_text = "初始化飞桨语音识别与合成"  # "initialize PaddlePaddle speech recognition and synthesis"
        self.warm_path = os.path.join(self.cache_dir, "warm_up.wav")
        self.warm_up()

    def warm_up(self) -> None:
        """Run one synthesis/recognition pass so the models are downloaded and loaded (Chinese only)."""
        self.tts_model(text=self.warm_text, output=self.warm_path)
        self.asr_model(self.warm_path, force_yes=True)

    def asr(self, talker: str, input_silk: str) -> str:
        """Convert a SILK voice message to WAV and run speech recognition on it."""
        timestmp = get_timestmp_string()
        outwav = os.path.join(self.cache_dir, f"asr_{talker}{timestmp}.wav")
        outpcm = os.path.join(self.cache_dir, f"pcm_{talker}{timestmp}.pcm")
        trans_result = silk2wav(input_silk, outwav, sr=16000, out_pcm=outpcm)
        if trans_result:
            out_wav, _ = trans_result
            asr_result = self.asr_model(out_wav, force_yes=True)
        else:
            asr_result = "语音识别错误,音频转换失败"  # recognition error: audio conversion failed
        return asr_result

    def tts(self, talker: str, text: str) -> Tuple[Optional[str], int]:
        """Synthesize text to WAV, then convert it to SILK; return (silk_path, duration_in_seconds)."""
        timestmp = get_timestmp_string()
        outwav = os.path.join(self.cache_dir, f"tts_{talker}{timestmp}.wav")
        self.tts_model(text=text, output=outwav)
        if os.path.exists(outwav):
            outsilk, _, duration = wav2silk(media_path=outwav, out_path=self.cache_dir)
        else:
            outsilk = None
            duration = 0
        return outsilk, duration

    async def on_message(self, msg: Message) -> None:
        """Listen for message events: voice -> text (ASR), text -> voice (TTS)."""
        if msg.room():
            return
        talker = msg.talker().name

        if msg.type() == MessageType.MESSAGE_TYPE_AUDIO:
            # ASR: save the SILK voice file, transcribe it, reply with the text
            file_box = await msg.to_file_box()
            saved_file = os.path.join(self.cache_dir, f"silk_{talker}_{get_timestmp_string()}.silk")
            await file_box.to_file(saved_file)
            asr_result = self.asr(talker, saved_file)
            await msg.talker().say(asr_result)
        elif msg.type() == MessageType.MESSAGE_TYPE_TEXT:
            # TTS: synthesize the text and reply with a voice message
            text = msg.text()
            outsilk, duration = self.tts(talker, text)
            if outsilk:
                new_audio_file = FileBox.from_file(outsilk)
                # voiceLength is expected in milliseconds
                new_audio_file.metadata = {"voiceLength": duration * 1000}
                await msg.talker().say(new_audio_file)
            else:
                await msg.talker().say("语音合成失败")  # speech synthesis failed
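
As a rough standalone check of the speech stack this plugin wraps, the two PaddleSpeech executors can be exercised directly, mirroring what warm_up() does; the file name here is arbitrary and the Chinese models are downloaded on first use:

from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.tts.infer import TTSExecutor

tts = TTSExecutor()
asr = ASRExecutor()

# synthesize a short Chinese sentence to a WAV file ...
tts(text="初始化飞桨语音识别与合成", output="warm_up.wav")
# ... then transcribe it back; force_yes skips the interactive confirmation prompt
print(asr("warm_up.wav", force_yes=True))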

40 changes: 40 additions & 0 deletions src/wechaty_plugin_contrib/contrib/paddlespeech_plugin/util.py
@@ -0,0 +1,40 @@
import datetime
import os
from typing import Optional, Tuple

import pilk
from pydub import AudioSegment


def get_timestmp_string() -> str:
    """Return the current time as a YYYYMMDDHHMMSS string, used in cache file names."""
    return datetime.datetime.now().strftime("%Y%m%d%H%M%S")


def wav2silk(media_path: str, out_path: str = './') -> Tuple[str, str, int]:
    """Convert an audio file to mono 16-bit PCM, encode it to SILK, and
    return (silk_path, pcm_path, duration_in_seconds)."""
    media = AudioSegment.from_file(media_path)
    base_name = os.path.splitext(os.path.basename(media_path))[0]
    pcm_path = os.path.join(out_path, base_name + '.pcm')
    silk_path = os.path.join(out_path, base_name + '.silk')

    media.export(pcm_path, 's16le', parameters=['-ar', str(media.frame_rate), '-ac', '1']).close()
    duration = pilk.encode(pcm_path, silk_path, pcm_rate=media.frame_rate, tencent=True)
    return silk_path, pcm_path, duration


def silk2pcm(input_silk: str, out_pcm: str, pcm_rate: int = 24000) -> Tuple[int, str]:
    """Decode a SILK file to raw PCM; return (duration_in_seconds, pcm_path)."""
    duration = pilk.decode(input_silk, out_pcm, pcm_rate=pcm_rate)
    return duration, out_pcm


def pcm2wav(input_pcm: str, out_wav: str, sr: int) -> bool:
    """Wrap raw 16-bit mono PCM into a WAV container with ffmpeg; return True on success."""
    cmd = f"ffmpeg -y -f s16le -ar {sr} -ac 1 -i {input_pcm} {out_wav}"
    return os.system(cmd) == 0


def silk2wav(input_silk: str, out_wav: str, sr: int, out_pcm: str = "temp.pcm") -> Optional[Tuple[str, int]]:
    """Decode a SILK file to WAV; return (wav_path, duration_in_seconds) or None on failure."""
    duration = pilk.decode(input_silk, pcm=out_pcm, pcm_rate=sr)
    if pcm2wav(out_pcm, out_wav, sr):
        return out_wav, duration
    return None
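
Both conversion paths rely on ffmpeg being on the PATH (pcm2wav shells out to it directly, and pydub's export uses it as well). A rough round-trip sketch of these helpers, with input.wav as a hypothetical source file:

from wechaty_plugin_contrib.contrib.paddlespeech_plugin.util import wav2silk, silk2wav

# WAV -> SILK: returns the SILK path, the intermediate PCM path, and the duration in seconds
silk_path, pcm_path, duration = wav2silk('input.wav')

# SILK -> WAV: decode at 16 kHz, as the plugin does before running ASR
result = silk2wav(silk_path, 'roundtrip.wav', sr=16000, out_pcm='roundtrip.pcm')
if result:
    wav_path, seconds = result
    print(f'decoded {wav_path}: {seconds}s of audio')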
