Skip to content

Commit

Permalink
fix: tap voice in play to end playback (#14) (#38)
Browse files Browse the repository at this point in the history
  • Loading branch information
liou666 committed May 6, 2023
1 parent 4851f7c commit 6d6cf10
Show file tree
Hide file tree
Showing 5 changed files with 149 additions and 58 deletions.
37 changes: 37 additions & 0 deletions src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,43 @@ export const azureRegions = [
'westus2',
]

/**
 * Maps a voice style identifier to the Simplified Chinese label shown for it.
 *
 * Keys are looked up by arbitrary strings, so the map is typed as
 * `Record<string, string>`; an unknown key yields `undefined` at runtime.
 * NOTE(review): the keys look like the style values accepted by the SSML
 * `mstts:express-as` element — confirm against the caller.
 *
 * Insertion order is kept as-is because consumers may iterate the map to
 * build a list.
 */
export const voiceStyleMap: Record<string, string> = {
  'advertisement_upbeat': '产品推广',
  'affectionate': '亲切',
  'angry': '生气',
  'assistant': '数字助理',
  'calm': '冷静',
  'chat': '轻松',
  'cheerful': '愉悦',
  'customerservice': '热情',
  'depressed': '沮丧',
  'disgruntled': '抱怨',
  // Label corrected from the misspelling "记录片" to the standard "纪录片".
  'documentary-narration': '纪录片',
  'embarrassed': '犹豫',
  'empathetic': '关心',
  'envious': '钦佩',
  'excited': '乐观',
  'fearful': '不安',
  'friendly': '友好',
  'gentle': '温和',
  'hopeful': '温暖',
  'lyrical': '感伤',
  'narration-professional': '客观',
  'narration-relaxed': '舒缓',
  'newscast': '新闻播报',
  'newscast-casual': '随意新闻',
  'newscast-formal': '权威新闻',
  'poetry-reading': '诗歌',
  'sad': '悲伤',
  'serious': '严肃',
  'shouting': '大喊',
  'sports_commentary': '轻松体育赛事',
  'sports_commentary_excited': '紧张体育赛事',
  'whispering': '柔和',
  'terrified': '害怕',
  'unfriendly': '无情',
}

export const openaiModels = [
'gpt-3.5-turbo',
'gpt-3.5-turbo-0301',
Expand Down
64 changes: 36 additions & 28 deletions src/hooks/useSpeechService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ import type { VoiceInfo } from 'microsoft-cognitiveservices-speech-sdk'
import {
AudioConfig,
CancellationErrorCode,
ResultReason,
SpeakerAudioDestination,
SpeechConfig,
SpeechRecognizer,
Expand Down Expand Up @@ -52,6 +51,7 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z

// const isFetchAllVoices = ref(false) // 是否在请求所有语音列表
const rate = ref(1) // 语速 (0,2]
const style = ref('Neural') // 情感

let mediaRecorder: MediaRecorder | null
const chunks: Blob[] = []
Expand All @@ -61,35 +61,15 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z

const recognizer = ref<SpeechRecognizer>(new SpeechRecognizer(speechConfig.value))
const synthesizer = ref<SpeechSynthesizer>(new SpeechSynthesizer(speechConfig.value))

// 引入变量,触发 SpeechSynthesizer 实例的重新创建
const count = ref(0)

const player = ref(new SpeakerAudioDestination())
watch([language, voiceName, count, azureKey, azureRegion, ttsPassword], ([lang, voice]) => {
speechConfig.value = SpeechConfig.fromSubscription(resultAzureKey.value, resultAzureRegion.value)
speechConfig.value.speechRecognitionLanguage = lang
speechConfig.value.speechSynthesisLanguage = lang
speechConfig.value.speechSynthesisVoiceName = voice
console.log(lang, voice)

// 通过playback结束事件来判断播放结束
const player = new SpeakerAudioDestination()
player.onAudioStart = function (_) {
if (isSynthesError.value) return
isPlaying.value = true
isPlayend.value = false
console.log('playback started')
}
player.onAudioEnd = function (_) {
console.log('playback finished')
isPlaying.value = false
isPlayend.value = true
}

const audioConfig = AudioConfig.fromDefaultMicrophoneInput()
const audioConfiga = AudioConfig.fromSpeakerOutput(player)
recognizer.value = new SpeechRecognizer(speechConfig.value, audioConfig)
synthesizer.value = new SpeechSynthesizer(speechConfig.value, audioConfiga)
}, {
immediate: true,
})
Expand All @@ -103,6 +83,7 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
mediaRecorder = new MediaRecorder(stream)

mediaRecorder.ondataavailable = (e) => {
console.log(chunks, 'c')
chunks.push(e.data)
}

Expand All @@ -117,6 +98,9 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
}

const startRecognizeSpeech = async (cb?: (text: string) => void) => {
const audioConfig = AudioConfig.fromDefaultMicrophoneInput()
recognizer.value = new SpeechRecognizer(speechConfig.value, audioConfig)

isRecognizReadying.value = true

recognizer.value.canceled = () => {
Expand All @@ -143,7 +127,6 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
isRecognizReadying.value = false
isRecognizing.value = false
}

recognizer.value.startContinuousRecognitionAsync(async () => {
await audioRecorder()
isRecognizing.value = true
Expand All @@ -160,8 +143,7 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z

// 停止语音识别
const stopRecognizeSpeech = (): Promise<void> => {
mediaRecorder!.stop()

mediaRecorder?.stop()
isRecognizReadying.value = false
return new Promise((resolve, reject) => {
recognizer.value.stopContinuousRecognitionAsync(() => {
Expand Down Expand Up @@ -211,18 +193,23 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
})
}

const ssmlToSpeak = async (text: string, { voice, voiceRate, lang }: { voice?: string; voiceRate?: number; lang?: string } = {}) => {
const ssmlToSpeak = async (text: string, { voice, voiceRate, lang, voiceStyle }: { voice?: string; voiceRate?: number; lang?: string; voiceStyle?: string } = {}) => {
applySynthesizerConfiguration()

isSynthesizing.value = true
isSynthesError.value = false
const targetLang = lang || speechConfig.value.speechSynthesisLanguage
const targetVoice = voice || speechConfig.value.speechSynthesisVoiceName
const targetRate = voiceRate || rate.value
const targetFeel = voiceStyle || style.value

const ssml = `
<speak version="1.0" xmlns="https://www.w3.org/2001/10/synthesis" xml:lang="${targetLang}">
<speak version="1.0" xmlns:mstts="https://www.w3.org/2001/mstts" xmlns="https://www.w3.org/2001/10/synthesis" xml:lang="${targetLang}">
<voice name="${targetVoice}">
<prosody rate="${targetRate}">
${text}
<mstts:express-as style="${targetFeel}" styledegree="1.5">
${text}
</mstts:express-as>
</prosody>
</voice>
</speak>`
Expand Down Expand Up @@ -274,6 +261,25 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
return res.voices
}

/**
 * Replaces the shared `player` with a fresh speaker audio destination,
 * attaches playback start/end callbacks that update the playback flags,
 * and replaces the shared `synthesizer` with one that outputs to that
 * destination using the current `speechConfig`.
 */
function applySynthesizerConfiguration() {
  // Playback completion is detected through the destination's end-of-audio callback.
  player.value = new SpeakerAudioDestination()

  player.value.onAudioStart = () => {
    // Leave the playback flags untouched when a synthesis error is flagged.
    if (!isSynthesError.value) {
      isPlaying.value = true
      isPlayend.value = false
      console.log('playback started.....')
    }
  }

  player.value.onAudioEnd = () => {
    console.log('playback finished....')
    isPlaying.value = false
    isPlayend.value = true
  }

  // Route synthesized audio through the destination stored in `player`.
  const speakerOutput = AudioConfig.fromSpeakerOutput(player.value)
  synthesizer.value = new SpeechSynthesizer(speechConfig.value, speakerOutput)
}

return {
languages,
language,
Expand All @@ -292,6 +298,8 @@ export const useSpeechService = ({ langs = <const>['fr-FR', 'ja-JP', 'en-US', 'z
allVoices,
isSynthesizing,
rate,
style,
audioBlob,
player,
}
}
Loading

0 comments on commit 6d6cf10

Please sign in to comment.