feat: access the azure voice system
liuxinqi committed Mar 29, 2023
1 parent f21db55 commit 8daad6c
Showing 5 changed files with 74 additions and 107 deletions.
6 changes: 6 additions & 0 deletions .env.example
@@ -1,2 +1,8 @@
 # OpenAI Key
 VITE_OPENAI_API_KEY=xxx
+# Network Proxy
+VITE_SERVE_PROXY=xxx
+# Azure Key
+VITE_SCRIPTION_KEY=xxx
+# Azure Region
+VITE_REGION=xxx
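
The two new Azure entries are the Speech resource's subscription key and region (the variable name `VITE_SCRIPTION_KEY` is kept exactly as the code reads it). A minimal sketch of how they are consumed, mirroring the Content.vue change further down in this commit:

```ts
// Sketch only, based on the Content.vue diff below.
// Vite exposes VITE_-prefixed variables on import.meta.env at build time.
import { SpeechService } from '@/utils'

const { VITE_SCRIPTION_KEY, VITE_REGION } = import.meta.env
const speechService = new SpeechService(VITE_SCRIPTION_KEY, VITE_REGION)
```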
2 changes: 1 addition & 1 deletion index.html
@@ -4,7 +4,7 @@
   <meta charset="UTF-8" />
   <link rel="icon" type="image/x-icon" href="/favicon.ico" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
-  <meta http-equiv="Content-Security-Policy" content="script-src 'self' 'unsafe-inline';" />
+  <!-- <meta http-equiv="Content-Security-Policy" content="script-src 'self' 'unsafe-inline';" /> -->
   <title>Polyglot</title>
 </head>
 <body class="font-sans dark:text-white">
1 change: 1 addition & 0 deletions package.json
@@ -25,6 +25,7 @@
"dependencies": {
"@vueuse/core": "^9.13.0",
"eventsource-parser": "^0.1.0",
"microsoft-cognitiveservices-speech-sdk": "^1.26.0",
"unocss": "^0.50.4",
"vue": "^3.2.47"
},
40 changes: 22 additions & 18 deletions src/components/Content.vue
@@ -2,29 +2,24 @@
 import Button from '@/components/widgets/Button.vue'
 import { generateText } from '@/server/api'
 import { useScroll } from '@/hooks'
-import { Recognition, getKey, verifyKey } from '@/utils'
+import { SpeechService, getKey, verifyKey } from '@/utils'
+const { VITE_REGION, VITE_SCRIPTION_KEY } = import.meta.env

 // states
 const chatMessages = ref<ChatMessage[]>([])
 const message = ref('')
 const loading = ref(false)
 const text = ref('')
-const recognition = new Recognition('en-US')
+const speechService = new SpeechService(VITE_SCRIPTION_KEY, VITE_REGION)

 // hooks
 const { el, scrollToBottom } = useScroll()
-const speech = useSpeechSynthesis(text)
-const { start } = useSpeechRecognition()

 // effects
 watch(chatMessages.value, () => nextTick(() => scrollToBottom()))

 // methods
-function play(content: string) {
-  text.value = content
-  speech.speak()
-}

 const roleClass = (role: string) => {
   switch (role) {
     case 'user':
@@ -36,11 +31,21 @@ const roleClass = (role: string) => {
   }
 }

-const startTalking = () => {
-  recognition.start()
-  recognition.onResult((value) => {
-    console.log('value', value)
-  })
-}
+const speak = (content: string) => {
+  text.value = content
+  speechService.textToSpeak(content)
+}
+
+const recognize = async () => {
+  loading.value = true
+  try {
+    const text = await speechService.recognizeSpeech()
+    console.log(text)
+    loading.value = false
+  }
+  catch (error) {
+    loading.value = false
+  }
+}

 const onSubmit = async () => {
@@ -82,7 +87,7 @@ const onSubmit = async () => {
         {{ item.content }}
       </p>
       <p v-if="item.role === 'assistant'" flex>
-        <span class="bg-gray-100/20 rounded-lg w-4 py-1 px-3 center" @click="play(item.content)">
+        <span class="bg-gray-100/20 rounded-lg w-4 py-1 px-3 center" @click="speak(item.content)">
           <i icon-btn rotate-90 i-ic:sharp-wifi />
         </span>
         <!-- <span
@@ -99,8 +104,7 @@ const onSubmit = async () => {
<div class="flex h-10 w-[-webkit-fill-available] mt-1">
<Button
mr-1
i-carbon:microphon
@click="startTalking()"
@click="recognize()"
>
<i i-carbon:microphone />
</Button>
@@ -112,7 +116,7 @@ const onSubmit = async () => {
         input-box p-3 flex-1
       >
       <div v-else class="loading-btn">
-        AI Is Thinking...
+        loading...
       </div>
       <Button
         :disabled="loading"
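
As committed, `recognize()` only logs the transcript and duplicates the `loading` reset across the `try` and `catch` branches. A hedged alternative sketch (not part of this commit) that feeds the result into the component's existing `message` ref and uses `finally` for the reset:

```ts
// Hypothetical variant, not in this commit: push the recognized
// transcript into the chat input instead of logging it.
const recognize = async () => {
  loading.value = true
  try {
    message.value = await speechService.recognizeSpeech()
  }
  finally {
    loading.value = false // runs on success and on failure alike
  }
}
```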
132 changes: 44 additions & 88 deletions src/utils/speaker.ts
@@ -1,90 +1,46 @@
-export class Speaker {
-  public utter: SpeechSynthesisUtterance
-  public voices: SpeechSynthesisVoice[] = []
-
-  constructor(option: { lang?: string; pitch?: number; rate?: number; volume?: number; text?: string }) {
-    const {
-      lang = 'zh-CN',
-      pitch = 1,
-      rate = 1,
-      volume = 1,
-      text = '',
-    } = option
-    this.utter = new window.SpeechSynthesisUtterance()
-    this.utter.lang = lang
-    this.utter.pitch = pitch
-    this.utter.rate = rate
-    this.utter.volume = volume
-    this.utter.text = text
-    this.getVoices()
-  }
-
-  getVoices() {
-    window.speechSynthesis.onvoiceschanged = () => {
-      this.voices = window.speechSynthesis.getVoices()
-      if (this.voices.length > 0)
-        this.utter.voice = this.voices[0] // set the voice source
-    }
-  }
-
-  // start speaking the current utterance
-  start() {
-    window.speechSynthesis.speak(this.utter)
-  }
-
-  // pause playback
-  pause() {
-    window.speechSynthesis.pause()
-  }
-
-  // resume playback after a pause
-  resume() {
-    window.speechSynthesis.resume()
-  }
-
-  // cancel all queued speech
-  cancel() {
-    window.speechSynthesis.cancel()
-  }
-
-  // switch the utterance text and speak it
-  change(text: string) {
-    this.utter.text = text
-    window.speechSynthesis.speak(this.utter)
-  }
-}
-
-export class Recognition {
-  public recognition: any
-  public isListening: boolean
-  public result: string
-
-  constructor(lang = 'zh-CN') {
-    this.recognition = new (window.SpeechRecognition || window.webkitSpeechRecognition || window.mozSpeechRecognition || window.msSpeechRecognition)()
-    this.isListening = false
-    this.result = ''
-    this.recognition.lang = lang
-  }
-
-  // start speech recognition
-  start() {
-    this.isListening = true
-    this.recognition.start()
-  }
-
-  // stop speech recognition
-  stop() {
-    this.isListening = false
-    this.recognition.stop()
-  }
-
-  // listen for the recognition result
-  onResult(callback: (result: string) => void) {
-    this.recognition.onresult = (e: any) => {
-      const result = e.results[0][0].transcript
-      this.result = result
-      callback(result)
-    }
-  }
-}
+import type { VoiceInfo } from 'microsoft-cognitiveservices-speech-sdk'
+import {
+  AudioConfig,
+  SpeechConfig,
+  SpeechRecognizer,
+  SpeechSynthesizer,
+} from 'microsoft-cognitiveservices-speech-sdk'
+
+export class SpeechService {
+  private recognizer: SpeechRecognizer
+  private synthesizer: SpeechSynthesizer
+  private speechConfig: SpeechConfig
+
+  constructor(subscriptionKey: string, region: string) {
+    const speechConfig = SpeechConfig.fromSubscription(subscriptionKey, region)
+    speechConfig.speechRecognitionLanguage = 'en-US'
+    speechConfig.speechSynthesisLanguage = 'en-US'
+    speechConfig.speechSynthesisVoiceName = 'en-US-GuyNeural'
+
+    this.speechConfig = speechConfig
+
+    const audioConfig = AudioConfig.fromDefaultMicrophoneInput()
+    this.recognizer = new SpeechRecognizer(this.speechConfig, audioConfig)
+    this.synthesizer = new SpeechSynthesizer(this.speechConfig)
+  }
+
+  public recognizeSpeech(): Promise<string> {
+    return new Promise((resolve, reject) => {
+      this.recognizer.recognizeOnceAsync((result) => {
+        if (result.text)
+          resolve(result.text)
+        else
+          reject(new Error('Speech recognition failed'))
+      })
+    })
+  }
+
+  public textToSpeak(text: string, voice?: string) {
+    this.speechConfig.speechSynthesisVoiceName = voice || this.speechConfig.speechSynthesisVoiceName
+    this.synthesizer.speakTextAsync(text)
+  }
+
+  public async getVoices(): Promise<VoiceInfo[]> {
+    const res = await this.synthesizer.getVoicesAsync()
+    return res.voices
+  }
+}
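
For orientation, a minimal usage sketch of the new `SpeechService` under the assumptions the class itself sets up (default microphone input, `en-US` defaults); `en-US-JennyNeural` is merely an example of another stock Azure voice, not something this commit uses:

```ts
import { SpeechService } from '@/utils'

// The key and region come from the .env entries added in this commit.
const service = new SpeechService(
  import.meta.env.VITE_SCRIPTION_KEY,
  import.meta.env.VITE_REGION,
)

// One-shot recognition from the default microphone.
const text = await service.recognizeSpeech()

// Speak the transcript back, optionally overriding the default voice.
service.textToSpeak(text, 'en-US-JennyNeural')

// List the voices available in the configured region.
const voices = await service.getVoices()
console.log(voices.map(v => v.shortName))
```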
