From 3db452ef71940d8cc0d0c4b76ab08955b0738d2a Mon Sep 17 00:00:00 2001 From: wanggang Date: Wed, 8 Mar 2023 15:22:46 +0800 Subject: [PATCH] [voice] using baidu service to gen reply voice --- README.md | 6 +++--- channel/wechat/wechat_channel.py | 19 +++++++------------ common/tmp_dir.py | 20 ++++++++++++++++++++ config-template.json | 1 + voice/baidu/baidu_voice.py | 16 +++++++++++++++- voice/google/google_voice.py | 3 ++- 6 files changed, 48 insertions(+), 17 deletions(-) create mode 100644 common/tmp_dir.py diff --git a/README.md b/README.md index 93660e861..8fe3b30d9 100644 --- a/README.md +++ b/README.md @@ -72,9 +72,8 @@ cd chatgpt-on-wechat/ pip3 install itchat-uos==1.5.0.dev0 pip3 install --upgrade openai -默认使用openai的whisper-1模型 如果使用百度的语音识别,需要安装百度的pythonSDK -pip3 install baidu-aip +pip3 install baidu-aip chardet 如果使用google的语音识别,需要安装speech_recognition和依赖的ffmpeg和espeak pip3 install SpeechRecognition --在MacOS中安装ffmpeg,brew install ffmpeg espeak @@ -122,7 +121,8 @@ cp config-template.json config.json + 可选配置: `group_name_keyword_white_list`配置项支持模糊匹配群名称,`group_chat_keyword`配置项则支持模糊匹配群消息内容,用法与上述两个配置项相同。(Contributed by [evolay](https://github.com/evolay)) **3.语音识别** -+ 配置`speech_recognition=true`开启语音识别 ++ 配置`speech_recognition=true`开启语音识别,默认使用openai的whisper模型 ++ 配置`voice_reply_voice=true`语音回复语音,但是需要配置对应语音合成平台的key **4.其他配置** diff --git a/channel/wechat/wechat_channel.py b/channel/wechat/wechat_channel.py index 0f2061394..b861e358c 100644 --- a/channel/wechat/wechat_channel.py +++ b/channel/wechat/wechat_channel.py @@ -4,14 +4,13 @@ wechat channel """ -import os -import pathlib import itchat import json from itchat.content import * from channel.channel import Channel from concurrent.futures import ThreadPoolExecutor from common.log import logger +from common.tmp_dir import TmpDir from config import conf import requests import io @@ -38,12 +37,8 @@ def handler_single_voice(msg): class WechatChannel(Channel): - tmpFilePath = pathlib.Path('./tmp/') - def __init__(self): - pathExists = os.path.exists(self.tmpFilePath) - if not pathExists and conf().get('speech_recognition') == True: - os.makedirs(self.tmpFilePath) + pass def startup(self): # login by scan QRCode @@ -59,17 +54,17 @@ def handle_voice(self, msg): thread_pool.submit(self._do_handle_voice, msg) def _do_handle_voice(self, msg): - fileName = self.tmpFilePath+msg['FileName'] + fileName = TmpDir().path() + msg['FileName'] msg.download(fileName) content = super().build_voice_to_text(fileName) - self._handle_single_msg(msg, content, False) + self._handle_single_msg(msg, content, conf().get('voice_reply_voice')) def handle_text(self, msg): logger.debug("[WX]receive text msg: " + json.dumps(msg, ensure_ascii=False)) content = msg['Text'] self._handle_single_msg(msg, content, False) - def _handle_single_msg(self, msg, content, is_voice): + def _handle_single_msg(self, msg, content, reply_voice=False): from_user_id = msg['FromUserName'] to_user_id = msg['ToUserName'] # 接收人id other_user_id = msg['User']['UserName'] # 对手方id @@ -88,7 +83,7 @@ def _handle_single_msg(self, msg, content, is_voice): if img_match_prefix: content = content.split(img_match_prefix, 1)[1].strip() thread_pool.submit(self._do_send_img, content, from_user_id) - elif is_voice: + elif reply_voice: thread_pool.submit(self._do_send_voice, content, from_user_id) else : thread_pool.submit(self._do_send_text, content, from_user_id) @@ -101,7 +96,7 @@ def _handle_single_msg(self, msg, content, is_voice): if img_match_prefix: content = content.split(img_match_prefix, 1)[1].strip() thread_pool.submit(self._do_send_img, content, to_user_id) - elif is_voice: + elif reply_voice: thread_pool.submit(self._do_send_voice, content, to_user_id) else: thread_pool.submit(self._do_send_text, content, to_user_id) diff --git a/common/tmp_dir.py b/common/tmp_dir.py new file mode 100644 index 000000000..1738022ca --- /dev/null +++ b/common/tmp_dir.py @@ -0,0 +1,20 @@ + +import os +import pathlib +from config import conf + + +class TmpDir(object): + """A temporary directory that is deleted when the object is destroyed. + """ + + tmpFilePath = pathlib.Path('./tmp/') + + def __init__(self): + pathExists = os.path.exists(self.tmpFilePath) + if not pathExists and conf().get('speech_recognition') == True: + os.makedirs(self.tmpFilePath) + + def path(self): + return str(self.tmpFilePath) + '/' + \ No newline at end of file diff --git a/config-template.json b/config-template.json index f7549d470..7e693f6f9 100644 --- a/config-template.json +++ b/config-template.json @@ -8,6 +8,7 @@ "image_create_prefix": ["画", "看", "找"], "conversation_max_tokens": 1000, "speech_recognition": false, + "voice_reply_voice": false, "baidu_app_id": "YOUR BAIDU APP ID", "baidu_api_key": "YOUR BAIDU API KEY", "baidu_secret_key": "YOUR BAIDU SERVICE KEY", diff --git a/voice/baidu/baidu_voice.py b/voice/baidu/baidu_voice.py index 8534c2ba1..d99db37dc 100644 --- a/voice/baidu/baidu_voice.py +++ b/voice/baidu/baidu_voice.py @@ -2,7 +2,10 @@ """ baidu voice service """ +import time from aip import AipSpeech +from common.log import logger +from common.tmp_dir import TmpDir from voice.voice import Voice from config import conf @@ -19,4 +22,15 @@ def voiceToText(self, voice_file): pass def textToVoice(self, text): - pass + result = self.client.synthesis(text, 'zh', 1, { + 'spd': 5, 'pit': 5, 'vol': 5, 'per': 111 + }) + if not isinstance(result, dict): + fileName = TmpDir().path() + '语音回复_' + str(int(time.time())) + '.mp3' + with open(fileName, 'wb') as f: + f.write(result) + logger.info('[Baidu] textToVoice text={} voice file name={}'.format(text, fileName)) + return fileName + else: + logger.error('[Baidu] textToVoice error={}'.format(result)) + return None diff --git a/voice/google/google_voice.py b/voice/google/google_voice.py index 3fff9d7e6..8e339f2bc 100644 --- a/voice/google/google_voice.py +++ b/voice/google/google_voice.py @@ -9,6 +9,7 @@ import speech_recognition import pyttsx3 from common.log import logger +from common.tmp_dir import TmpDir from voice.voice import Voice @@ -42,7 +43,7 @@ def voiceToText(self, voice_file): return "抱歉,无法连接到 Google 语音识别服务;{0}".format(e) def textToVoice(self, text): - textFile = self.tmpFilePath + '语音回复_' + str(int(time.time())) + '.mp3' + textFile = TmpDir().path() + '语音回复_' + str(int(time.time())) + '.mp3' self.engine.save_to_file(text, textFile) self.engine.runAndWait() logger.info(