Abstract TTS module

enricoros · zoollcar · Oct 14, 2024 · Oct 24, 2024 · Oct 24, 2024 · Oct 25, 2024
commit a538cc195a19924d48fc953707b1a1d39e93fdb1
diff --git a/app/api/elevenlabs/speech/route.ts b/app/api/elevenlabs/speech/route.ts
@@ -1,2 +1,2 @@
 export const runtime = 'edge';
-export { elevenLabsHandler as POST } from '~/modules/elevenlabs/elevenlabs.server';
+export { elevenLabsHandler as POST } from '~/modules/tts/vendors/elevenlabs/elevenlabs.server';
diff --git a/pages/info/debug.tsx b/pages/info/debug.tsx
@@ -18,7 +18,8 @@ import { ROUTE_APP_CHAT, ROUTE_INDEX } from '~/common/app.routes';
 import { Release } from '~/common/app.release';
 
 // capabilities access
-import { useCapabilityBrowserSpeechRecognition, useVoiceCapability, useCapabilityTextToImage } from '~/common/components/useCapabilities';
+import { useCapabilityBrowserSpeechRecognition, useCapabilityTextToImage } from '~/common/components/useCapabilities';
+import { useTTSCapability } from '~/modules/tts/tts.client.hooks';
 
 // stores access
 import { getLLMsDebugInfo } from '~/common/stores/llms/store-llms';
@@ -95,7 +96,7 @@ function AppDebug() {
   const cProduct = {
     capabilities: {
       mic: useCapabilityBrowserSpeechRecognition(),
-      elevenLabs: useVoiceCapability(),
+      elevenLabs: useTTSCapability(),
       textToImage: useCapabilityTextToImage(),
     },
     models: getLLMsDebugInfo(),

diff --git a/src/apps/call/CallWizard.tsx b/src/apps/call/CallWizard.tsx
@@ -12,11 +12,13 @@ import WarningRoundedIcon from '@mui/icons-material/WarningRounded';
 import { animationColorRainbow } from '~/common/util/animUtils';
 import { navigateBack } from '~/common/app.routes';
 import { optimaOpenPreferences } from '~/common/layout/optima/useOptima';
-import { useCapabilityBrowserSpeechRecognition, useVoiceCapability } from '~/common/components/useCapabilities';
+import { useCapabilityBrowserSpeechRecognition } from '~/common/components/useCapabilities';
+import { useTTSCapability } from '~/modules/tts/tts.client.hooks';
 import { useChatStore } from '~/common/stores/chat/store-chats';
 import { useUICounter } from '~/common/state/store-ui';
 
 
+
 function StatusCard(props: { icon: React.JSX.Element, hasIssue: boolean, text: string, button?: React.JSX.Element }) {
   return (
     <Card sx={{ width: '100%' }}>
@@ -45,7 +47,7 @@ export function CallWizard(props: { strict?: boolean, conversationId: string | n
 
   // external state
   const recognition = useCapabilityBrowserSpeechRecognition();
-  const synthesis = useVoiceCapability();
+  const synthesis = useTTSCapability();
   const chatIsEmpty = useChatStore(state => {
     if (!props.conversationId)
       return false;

diff --git a/src/apps/call/Telephone.tsx b/src/apps/call/Telephone.tsx
@@ -13,10 +13,10 @@ import { ScrollToBottom } from '~/common/scroll-to-bottom/ScrollToBottom';
 import { ScrollToBottomButton } from '~/common/scroll-to-bottom/ScrollToBottomButton';
 import { useChatLLMDropdown } from '../chat/components/layout-bar/useLLMDropdown';
 
-import { EXPERIMENTAL_speakTextStream } from '~/common/components/useVoiceCapabilities';
+import { EXPERIMENTAL_speakTextStream } from '~/modules/tts/tts.client';
 import { SystemPurposeId, SystemPurposes } from '../../data';
 import { llmStreamingChatGenerate, VChatMessageIn } from '~/modules/llms/llm.client';
-import { useElevenLabsVoiceDropdown } from '~/modules/elevenlabs/useElevenLabsVoiceDropdown';
+import { TTSSetting } from '~/modules/tts/tts.setting';
 
 import type { OptimaBarControlMethods } from '~/common/layout/optima/bar/OptimaBarDropdown';
 import { AudioPlayer } from '~/common/util/audio/AudioPlayer';
@@ -39,6 +39,7 @@ import { CallStatus } from './components/CallStatus';
 import { useAppCallStore } from './state/store-app-call';
 
 
+
 function CallMenuItems(props: {
   pushToTalk: boolean,
   setPushToTalk: (pushToTalk: boolean) => void,
@@ -48,8 +49,7 @@ function CallMenuItems(props: {
 
   // external state
   const { grayUI, toggleGrayUI } = useAppCallStore();
-  const { voicesDropdown } = useElevenLabsVoiceDropdown(false, !props.override);
-
+
   const handlePushToTalkToggle = () => props.setPushToTalk(!props.pushToTalk);
 
   const handleChangeVoiceToggle = () => props.setOverride(!props.override);
@@ -68,10 +68,10 @@ function CallMenuItems(props: {
       <Switch checked={props.override} onChange={handleChangeVoiceToggle} sx={{ ml: 'auto' }} />
     </MenuItem>
 
-    <MenuItem>
-      <ListItemDecorator>{' '}</ListItemDecorator>
-      {voicesDropdown}
+    <MenuItem sx={{flexWrap: 'wrap'}}>
+      <TTSSetting />
     </MenuItem>
+
 
     <ListDivider />
 

diff --git a/src/apps/chat/AppChat.tsx b/src/apps/chat/AppChat.tsx
@@ -10,7 +10,7 @@ import { FlattenerModal } from '~/modules/aifn/flatten/FlattenerModal';
 import { TradeConfig, TradeModal } from '~/modules/trade/TradeModal';
 import { downloadSingleChat, importConversationsFromFilesAtRest, openConversationsAtRestPicker } from '~/modules/trade/trade.client';
 import { imaginePromptFromTextOrThrow } from '~/modules/aifn/imagine/imaginePromptFromText';
-import { speakText } from '~/common/components/useVoiceCapabilities';
+import { speakText } from '~/modules/tts/tts.client';
 import { useAreBeamsOpen } from '~/modules/beam/store-beam.hooks';
 import { useCapabilityTextToImage } from '~/modules/t2i/t2i.client';
 

diff --git a/src/apps/chat/components/ChatMessageList.tsx b/src/apps/chat/components/ChatMessageList.tsx
@@ -19,7 +19,7 @@ import { getConversation, useChatStore } from '~/common/stores/chat/store-chats'
 import { openFileForAttaching } from '~/common/components/ButtonAttachFiles';
 import { optimaOpenPreferences } from '~/common/layout/optima/useOptima';
 import { useBrowserTranslationWarning } from '~/common/components/useIsBrowserTranslating';
-import { useVoiceCapability } from '~/common/components/useCapabilities';
+import { useTTSCapability } from '~/modules/tts/tts.client.hooks';
 import { useChatOverlayStore } from '~/common/chat-overlay/store-perchat_vanilla';
 import { useScrollToBottom } from '~/common/scroll-to-bottom/useScrollToBottom';
 
@@ -30,6 +30,7 @@ import { PersonaSelector } from './persona-selector/PersonaSelector';
 import { useChatAutoSuggestHTMLUI, useChatShowSystemMessages } from '../store-app-chat';
 
 
+
 const stableNoMessages: DMessage[] = [];
 
 /**
@@ -75,7 +76,7 @@ export function ChatMessageList(props: {
     _composerInReferenceToCount: state.inReferenceTo?.length ?? 0,
     ephemerals: state.ephemerals?.length ? state.ephemerals : null,
   })));
-  const { mayWork: isSpeakable } = useVoiceCapability();
+  const { mayWork: isSpeakable } = useTTSCapability();
 
   // derived state
   const { conversationHandler, conversationId, capabilityHasT2I, onConversationBranch, onConversationExecuteHistory, onTextDiagram, onTextImagine, onTextSpeak } = props;

diff --git a/src/apps/chat/editors/persona/PersonaChatMessageSpeak.ts b/src/apps/chat/editors/persona/PersonaChatMessageSpeak.ts
@@ -1,4 +1,4 @@
-import { speakText } from '~/modules/elevenlabs/elevenlabs.client';
+import { speakText } from '~/modules/tts/tts.client';
 
 import { isTextContentFragment } from '~/common/stores/chat/chat.fragments';
 

diff --git a/src/apps/chat/store-app-chat.ts b/src/apps/chat/store-app-chat.ts
@@ -1,9 +1,9 @@
 import { create } from 'zustand';
 import { persist } from 'zustand/middleware';
 import { useShallow } from 'zustand/react/shallow';
-import { ASREngineKey, ASREngineList, TTSEngineKey, TTSEngineList } from '~/common/components/useVoiceCapabilities';
 
 import type { DLLMId } from '~/common/stores/llms/llms.types';
+import { ASREngineKey, ASREngineList } from '~/modules/asr/asr.client';
 
 
 export type ChatAutoSpeakType = 'off' | 'firstLine' | 'all';
@@ -52,9 +52,6 @@ interface AppChatStore {
   micTimeoutMs: number;
   setMicTimeoutMs: (micTimeoutMs: number) => void;
 
-  TTSEngine: TTSEngineKey;
-  setTTSEngine: (TTSEngine: TTSEngineKey) => void;
-
   ASREngine: ASREngineKey;
   setASREngine: (ASREngine: ASREngineKey) => void;
 
@@ -121,9 +118,6 @@ const useAppChatStore = create<AppChatStore>()(persist(
     micTimeoutMs: 2000,
     setMicTimeoutMs: (micTimeoutMs: number) => _set({ micTimeoutMs }),
 
-    TTSEngine: TTSEngineList[0].key,
-    setTTSEngine: (TTSEngine: TTSEngineKey) => _set({ TTSEngine }),
-
     ASREngine: ASREngineList[0].key,
     setASREngine: (ASREngine: ASREngineKey) => _set({ ASREngine }),
 
@@ -211,10 +205,6 @@ export const useChatMicTimeoutMsValue = (): number =>
 export const useChatMicTimeoutMs = (): [number, (micTimeoutMs: number) => void] =>
   useAppChatStore(useShallow(state => [state.micTimeoutMs, state.setMicTimeoutMs]));
 
-export const useTTSEngine = (): [TTSEngineKey, (TTSEngine: TTSEngineKey) => void] =>
-  useAppChatStore(useShallow(state => [state.TTSEngine, state.setTTSEngine]));
-export const getTTSEngine = () => useAppChatStore.getState().TTSEngine;
-
 export const useASREngine = (): [ASREngineKey, (ASREngine: ASREngineKey) => void] =>
   useAppChatStore(useShallow(state => [state.ASREngine, state.setASREngine]));
 

diff --git a/src/apps/settings-modal/SettingsModal.tsx b/src/apps/settings-modal/SettingsModal.tsx
@@ -9,7 +9,6 @@ import WarningRoundedIcon from '@mui/icons-material/WarningRounded';
 
 import { BrowseSettings } from '~/modules/browse/BrowseSettings';
 import { DallESettings } from '~/modules/t2i/dalle/DallESettings';
-import { ElevenlabsSettings } from '~/modules/elevenlabs/ElevenlabsSettings';
 import { GoogleSearchSettings } from '~/modules/google/GoogleSearchSettings';
 import { ProdiaSettings } from '~/modules/t2i/prodia/ProdiaSettings';
 import { T2ISettings } from '~/modules/t2i/T2ISettings';
@@ -22,9 +21,9 @@ import { AppChatSettingsAI } from './AppChatSettingsAI';
 import { AppChatSettingsUI } from './settings-ui/AppChatSettingsUI';
 import { UxLabsSettings } from './UxLabsSettings';
 import { VoiceSettings } from './VoiceSettings';
-import { BrowserSpeechSettings } from '~/modules/browser/speech-synthesis/BrowserSpeechSettings';
-
-import { useTTSEngine } from 'src/apps/chat/store-app-chat';
+import { useTTSEngine } from '~/modules/tts/useTTSStore';
+import { TTSSetting } from '~/modules/tts/tts.setting';
+import { getName as getTTSEngineName } from '~/modules/tts/tts.client';
 
 
 // styled <AccordionGroup variant='plain'> into a Topics component
@@ -198,12 +197,9 @@ export function SettingsModal(props: {
             <Topic icon='🎙️' title='Voice settings'>
               <VoiceSettings />
             </Topic>
-            {TTSEngine === 'elevenlabs' && <Topic icon='📢' title='ElevenLabs API'>
-              <ElevenlabsSettings />
-            </Topic>}
-            {TTSEngine === 'webspeech' && <Topic icon='📢' title='Web Speech API'>
-              <BrowserSpeechSettings />
-            </Topic>}
+            <Topic icon='📢' title={getTTSEngineName()}>
+              <TTSSetting />
+            </Topic>
           </Topics>
         </TabPanel>
 

diff --git a/src/apps/settings-modal/VoiceSettings.tsx b/src/apps/settings-modal/VoiceSettings.tsx
@@ -1,14 +1,17 @@
 import * as React from 'react';
 
-import { FormControl } from '@mui/joy';
+import { FormControl, Option, Select } from '@mui/joy';
+import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown';
 
-import { useASREngine, useChatAutoAI, useChatMicTimeoutMs, useTTSEngine } from '../chat/store-app-chat';
+import { useASREngine, useChatAutoAI, useChatMicTimeoutMs } from '../chat/store-app-chat';
 
 import { FormLabelStart } from '~/common/components/forms/FormLabelStart';
 import { FormRadioControl } from '~/common/components/forms/FormRadioControl';
 import { LanguageSelect } from '~/common/components/LanguageSelect';
 import { useIsMobile } from '~/common/components/useMatchMedia';
-import { hasVoices, ASREngineList, TTSEngineList, TTSEngineKey } from '~/common/components/useVoiceCapabilities';
+import { ASREngineKey, ASREngineList } from '~/modules/asr/asr.client';
+import { TTSEngineKey, TTSEngineList, useTTSEngine } from '~/modules/tts/useTTSStore';
+import { useTTSCapability } from '~/modules/tts/tts.client.hooks';
 
 export function VoiceSettings() {
   // external state
@@ -23,6 +26,18 @@ export function VoiceSettings() {
   const chatTimeoutValue: string = '' + chatTimeoutMs;
   const setChatTimeoutValue = (value: string) => value && setChatTimeoutMs(parseInt(value));
 
+  const { mayWork: hasVoices } = useTTSCapability();
+
+  const handleTTSChanged = (_event: any, newValue: TTSEngineKey | null) => {
+    if (!newValue) return;
+    setTTSEngine(newValue);
+  };
+
+  const handleASRChanged = (_event: any, newValue: ASREngineKey | null) => {
+    if (!newValue) return;
+    setASREngine(newValue);
+  };
+
   return (
     <>
       {/* LanguageSelect: moved from the UI settings (where it logically belongs), just to group things better from an UX perspective */}
@@ -63,23 +78,44 @@ export function VoiceSettings() {
         onChange={setAutoSpeak}
       />
 
-      <FormRadioControl
-        title="TTS engine"
-        description="Text to speech"
-        tooltip=""
-        options={TTSEngineList.map((i) => ({ value: i.key, label: i.label }))}
-        value={TTSEngine}
-        onChange={setTTSEngine}
-      />
+      <FormControl orientation="horizontal" sx={{ justifyContent: 'space-between', alignItems: 'center' }}>
+        <FormLabelStart title="TTS engine" description="Text to speech / voice synthesis" tooltip="" />
 
-      <FormRadioControl
-        title="ASR engine"
-        description="Automatic Speech Recognition"
-        tooltip=""
-        options={ASREngineList.map((i) => ({ value: i.key, label: i.label }))}
-        value={ASREngine}
-        onChange={setASREngine}
-      />
+        <Select
+          value={TTSEngine}
+          onChange={handleTTSChanged}
+          indicator={<KeyboardArrowDownIcon />}
+          slotProps={{
+            root: { sx: { minWidth: 200 } },
+            indicator: { sx: { opacity: 0.5 } },
+          }}
+        >
+          {TTSEngineList.map((i) => (
+            <Option key={i.key} value={i.key}>
+              {i.label}
+            </Option>
+          ))}
+        </Select>
+      </FormControl>
+
+      <FormControl orientation="horizontal" sx={{ justifyContent: 'space-between', alignItems: 'center' }}>
+        <FormLabelStart title="ASR engine" description="Automatic Speech Recognition" tooltip="" />
+        <Select
+          value={ASREngine}
+          onChange={handleASRChanged}
+          indicator={<KeyboardArrowDownIcon />}
+          slotProps={{
+            root: { sx: { minWidth: 200 } },
+            indicator: { sx: { opacity: 0.5 } },
+          }}
+        >
+          {ASREngineList.map((i) => (
+            <Option key={i.key} value={i.key}>
+              {i.label}
+            </Option>
+          ))}
+        </Select>
+      </FormControl>
     </>
   );
 }
diff --git a/src/common/components/useCapabilities.ts b/src/common/components/useCapabilities.ts
@@ -21,18 +21,6 @@ export interface CapabilityBrowserSpeechRecognition {
 
 export { browserSpeechRecognitionCapability as useCapabilityBrowserSpeechRecognition } from './speechrecognition/useSpeechRecognition';
 
-
-/// Speech Synthesis
-
-export interface CapabilitySpeechSynthesis {
-  mayWork: boolean;
-  isConfiguredServerSide: boolean;
-  isConfiguredClientSide: boolean;
-}
-
-export { useCapability as useVoiceCapability } from '~/common/components/useVoiceCapabilities';
-
-
 /// Image Generation
 
 export interface TextToImageProvider {