fix: tap voice in play to end playback (#14) (#38)

liou666 · May 6, 2023 · 6d6cf10 · 6d6cf10
1 parent 4851f7c
commit 6d6cf10
Show file tree

Hide file tree

Showing 5 changed files with 149 additions and 58 deletions.
diff --git a/src/config.ts b/src/config.ts
@@ -224,6 +224,43 @@ export const azureRegions = [
   'westus2',
 ]
 
+export const voiceStyleMap = {
+  'advertisement_upbeat': '产品推广',
+  'affectionate': '亲切',
+  'angry': '生气',
+  'assistant': '数字助理',
+  'calm': '冷静',
+  'chat': '轻松',
+  'cheerful': '愉悦',
+  'customerservice': '热情',
+  'depressed': '沮丧',
+  'disgruntled': '抱怨',
+  'documentary-narration': '记录片',
+  'embarrassed': '犹豫',
+  'empathetic': '关心',
+  'envious': '钦佩',
+  'excited': '乐观',
+  'fearful': '不安',
+  'friendly': '友好',
+  'gentle': '温和',
+  'hopeful': '温暖',
+  'lyrical': '感伤',
+  'narration-professional': '客观',
+  'narration-relaxed': '舒缓',
+  'newscast': '新闻播报',
+  'newscast-casual': '随意新闻',
+  'newscast-formal': '权威新闻',
+  'poetry-reading': '诗歌',
+  'sad': '悲伤',
+  'serious': '严肃',
+  'shouting': '大喊',
+  'sports_commentary': '轻松体育赛事',
+  'sports_commentary_excited': '紧张体育赛事',
+  'whispering': '柔和',
+  'terrified': '害怕',
+  'unfriendly': '无情',
+} as Record<string, string>
+
 export const openaiModels = [
   'gpt-3.5-turbo',
   'gpt-3.5-turbo-0301',

diff --git a/src/hooks/useSpeechService.ts b/src/hooks/useSpeechService.ts
@@ -2,7 +2,6 @@ import type { VoiceInfo } from 'microsoft-cognitiveservices-speech-sdk'
 import {
   AudioConfig,
   CancellationErrorCode,
-  ResultReason,
   SpeakerAudioDestination,
   SpeechConfig,
   SpeechRecognizer,
@@ -52,6 +51,7 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
 
   // const isFetchAllVoices = ref(false) // 是否在请求所有语音列表
   const rate = ref(1) // 语速 (0,2]
+  const style = ref('Neural') // 情感
 
   let mediaRecorder: MediaRecorder | null
   const chunks: Blob[] = []
@@ -61,35 +61,15 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
 
   const recognizer = ref<SpeechRecognizer>(new SpeechRecognizer(speechConfig.value))
   const synthesizer = ref<SpeechSynthesizer>(new SpeechSynthesizer(speechConfig.value))
-
   // 引入变量，触发 SpeechSynthesizer 实例的重新创建
   const count = ref(0)
-
+  const player = ref(new SpeakerAudioDestination())
   watch([language, voiceName, count, azureKey, azureRegion, ttsPassword], ([lang, voice]) => {
     speechConfig.value = SpeechConfig.fromSubscription(resultAzureKey.value, resultAzureRegion.value)
     speechConfig.value.speechRecognitionLanguage = lang
     speechConfig.value.speechSynthesisLanguage = lang
     speechConfig.value.speechSynthesisVoiceName = voice
     console.log(lang, voice)
-
-    // 通过playback结束事件来判断播放结束
-    const player = new SpeakerAudioDestination()
-    player.onAudioStart = function (_) {
-      if (isSynthesError.value) return
-      isPlaying.value = true
-      isPlayend.value = false
-      console.log('playback started')
-    }
-    player.onAudioEnd = function (_) {
-      console.log('playback finished')
-      isPlaying.value = false
-      isPlayend.value = true
-    }
-
-    const audioConfig = AudioConfig.fromDefaultMicrophoneInput()
-    const audioConfiga = AudioConfig.fromSpeakerOutput(player)
-    recognizer.value = new SpeechRecognizer(speechConfig.value, audioConfig)
-    synthesizer.value = new SpeechSynthesizer(speechConfig.value, audioConfiga)
   }, {
     immediate: true,
   })
@@ -103,6 +83,7 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
     mediaRecorder = new MediaRecorder(stream)
 
     mediaRecorder.ondataavailable = (e) => {
+      console.log(chunks, 'c')
       chunks.push(e.data)
     }
 
@@ -117,6 +98,9 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
   }
 
   const startRecognizeSpeech = async (cb?: (text: string) => void) => {
+    const audioConfig = AudioConfig.fromDefaultMicrophoneInput()
+    recognizer.value = new SpeechRecognizer(speechConfig.value, audioConfig)
+
     isRecognizReadying.value = true
 
     recognizer.value.canceled = () => {
@@ -143,7 +127,6 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
       isRecognizReadying.value = false
       isRecognizing.value = false
     }
-
     recognizer.value.startContinuousRecognitionAsync(async () => {
       await audioRecorder()
       isRecognizing.value = true
@@ -160,8 +143,7 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
 
   // 停止语音识别
   const stopRecognizeSpeech = (): Promise<void> => {
-    mediaRecorder!.stop()
-
+    mediaRecorder?.stop()
     isRecognizReadying.value = false
     return new Promise((resolve, reject) => {
       recognizer.value.stopContinuousRecognitionAsync(() => {
@@ -211,18 +193,23 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
     })
   }
 
-  const ssmlToSpeak = async (text: string, { voice, voiceRate, lang }: { voice?: string; voiceRate?: number; lang?: string } = {}) => {
+  const ssmlToSpeak = async (text: string, { voice, voiceRate, lang, voiceStyle }: { voice?: string; voiceRate?: number; lang?: string; voiceStyle?: string } = {}) => {
+    applySynthesizerConfiguration()
+
     isSynthesizing.value = true
     isSynthesError.value = false
     const targetLang = lang || speechConfig.value.speechSynthesisLanguage
     const targetVoice = voice || speechConfig.value.speechSynthesisVoiceName
     const targetRate = voiceRate || rate.value
+    const targetFeel = voiceStyle || style.value
 
     const ssml = `
-    <speak version="1.0" xmlns="https://www.w3.org/2001/10/synthesis" xml:lang="${targetLang}">
+    <speak version="1.0"  xmlns:mstts="https://www.w3.org/2001/mstts" xmlns="https://www.w3.org/2001/10/synthesis" xml:lang="${targetLang}">
       <voice name="${targetVoice}">
         <prosody rate="${targetRate}">
-          ${text}
+          <mstts:express-as style="${targetFeel}" styledegree="1.5">
+            ${text}
+          </mstts:express-as>
         </prosody>
       </voice>
     </speak>`
@@ -274,6 +261,25 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
     return res.voices
   }
 
+  function applySynthesizerConfiguration() {
+    // 通过playback结束事件来判断播放结束
+    player.value = new SpeakerAudioDestination()
+    player.value.onAudioStart = function (_) {
+      if (isSynthesError.value) return
+      isPlaying.value = true
+      isPlayend.value = false
+      console.log('playback started.....')
+    }
+    player.value.onAudioEnd = function (_) {
+      console.log('playback finished....')
+      isPlaying.value = false
+      isPlayend.value = true
+    }
+
+    const speakConfig = AudioConfig.fromSpeakerOutput(player.value)
+    synthesizer.value = new SpeechSynthesizer(speechConfig.value, speakConfig)
+  }
+
   return {
     languages,
     language,
@@ -292,6 +298,8 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
     allVoices,
     isSynthesizing,
     rate,
+    style,
     audioBlob,
+    player,
   }
 }