Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support Web Speech API #661

Open
wants to merge 4 commits into
base: v2-dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Abstract TTS module
  • Loading branch information
zoollcar committed Oct 25, 2024
commit a538cc195a19924d48fc953707b1a1d39e93fdb1
2 changes: 1 addition & 1 deletion app/api/elevenlabs/speech/route.ts
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
export const runtime = 'edge';
export { elevenLabsHandler as POST } from '~/modules/elevenlabs/elevenlabs.server';
export { elevenLabsHandler as POST } from '~/modules/tts/vendors/elevenlabs/elevenlabs.server';
5 changes: 3 additions & 2 deletions pages/info/debug.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ import { ROUTE_APP_CHAT, ROUTE_INDEX } from '~/common/app.routes';
import { Release } from '~/common/app.release';

// capabilities access
import { useCapabilityBrowserSpeechRecognition, useVoiceCapability, useCapabilityTextToImage } from '~/common/components/useCapabilities';
import { useCapabilityBrowserSpeechRecognition, useCapabilityTextToImage } from '~/common/components/useCapabilities';
import { useTTSCapability } from '~/modules/tts/tts.client.hooks';

// stores access
import { getLLMsDebugInfo } from '~/common/stores/llms/store-llms';
Expand Down Expand Up @@ -95,7 +96,7 @@ function AppDebug() {
const cProduct = {
capabilities: {
mic: useCapabilityBrowserSpeechRecognition(),
elevenLabs: useVoiceCapability(),
elevenLabs: useTTSCapability(),
textToImage: useCapabilityTextToImage(),
},
models: getLLMsDebugInfo(),
Expand Down
6 changes: 4 additions & 2 deletions src/apps/call/CallWizard.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@ import WarningRoundedIcon from '@mui/icons-material/WarningRounded';
import { animationColorRainbow } from '~/common/util/animUtils';
import { navigateBack } from '~/common/app.routes';
import { optimaOpenPreferences } from '~/common/layout/optima/useOptima';
import { useCapabilityBrowserSpeechRecognition, useVoiceCapability } from '~/common/components/useCapabilities';
import { useCapabilityBrowserSpeechRecognition } from '~/common/components/useCapabilities';
import { useTTSCapability } from '~/modules/tts/tts.client.hooks';
import { useChatStore } from '~/common/stores/chat/store-chats';
import { useUICounter } from '~/common/state/store-ui';



function StatusCard(props: { icon: React.JSX.Element, hasIssue: boolean, text: string, button?: React.JSX.Element }) {
return (
<Card sx={{ width: '100%' }}>
Expand Down Expand Up @@ -45,7 +47,7 @@ export function CallWizard(props: { strict?: boolean, conversationId: string | n

// external state
const recognition = useCapabilityBrowserSpeechRecognition();
const synthesis = useVoiceCapability();
const synthesis = useTTSCapability();
const chatIsEmpty = useChatStore(state => {
if (!props.conversationId)
return false;
Expand Down
14 changes: 7 additions & 7 deletions src/apps/call/Telephone.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ import { ScrollToBottom } from '~/common/scroll-to-bottom/ScrollToBottom';
import { ScrollToBottomButton } from '~/common/scroll-to-bottom/ScrollToBottomButton';
import { useChatLLMDropdown } from '../chat/components/layout-bar/useLLMDropdown';

import { EXPERIMENTAL_speakTextStream } from '~/common/components/useVoiceCapabilities';
import { EXPERIMENTAL_speakTextStream } from '~/modules/tts/tts.client';
import { SystemPurposeId, SystemPurposes } from '../../data';
import { llmStreamingChatGenerate, VChatMessageIn } from '~/modules/llms/llm.client';
import { useElevenLabsVoiceDropdown } from '~/modules/elevenlabs/useElevenLabsVoiceDropdown';
import { TTSSetting } from '~/modules/tts/tts.setting';

import type { OptimaBarControlMethods } from '~/common/layout/optima/bar/OptimaBarDropdown';
import { AudioPlayer } from '~/common/util/audio/AudioPlayer';
Expand All @@ -39,6 +39,7 @@ import { CallStatus } from './components/CallStatus';
import { useAppCallStore } from './state/store-app-call';



function CallMenuItems(props: {
pushToTalk: boolean,
setPushToTalk: (pushToTalk: boolean) => void,
Expand All @@ -48,8 +49,7 @@ function CallMenuItems(props: {

// external state
const { grayUI, toggleGrayUI } = useAppCallStore();
const { voicesDropdown } = useElevenLabsVoiceDropdown(false, !props.override);


// Invert the current push-to-talk flag and hand the new value to the setter received via props.
const handlePushToTalkToggle = () => {
  const nextPushToTalk = !props.pushToTalk;
  return props.setPushToTalk(nextPushToTalk);
};

// Invert the current voice-override flag and hand the new value to the setter received via props.
const handleChangeVoiceToggle = () => {
  const nextOverride = !props.override;
  return props.setOverride(nextOverride);
};
Expand All @@ -68,10 +68,10 @@ function CallMenuItems(props: {
<Switch checked={props.override} onChange={handleChangeVoiceToggle} sx={{ ml: 'auto' }} />
</MenuItem>

<MenuItem>
<ListItemDecorator>{' '}</ListItemDecorator>
{voicesDropdown}
<MenuItem sx={{flexWrap: 'wrap'}}>
<TTSSetting />
</MenuItem>


<ListDivider />

Expand Down
2 changes: 1 addition & 1 deletion src/apps/chat/AppChat.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import { FlattenerModal } from '~/modules/aifn/flatten/FlattenerModal';
import { TradeConfig, TradeModal } from '~/modules/trade/TradeModal';
import { downloadSingleChat, importConversationsFromFilesAtRest, openConversationsAtRestPicker } from '~/modules/trade/trade.client';
import { imaginePromptFromTextOrThrow } from '~/modules/aifn/imagine/imaginePromptFromText';
import { speakText } from '~/common/components/useVoiceCapabilities';
import { speakText } from '~/modules/tts/tts.client';
import { useAreBeamsOpen } from '~/modules/beam/store-beam.hooks';
import { useCapabilityTextToImage } from '~/modules/t2i/t2i.client';

Expand Down
5 changes: 3 additions & 2 deletions src/apps/chat/components/ChatMessageList.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import { getConversation, useChatStore } from '~/common/stores/chat/store-chats'
import { openFileForAttaching } from '~/common/components/ButtonAttachFiles';
import { optimaOpenPreferences } from '~/common/layout/optima/useOptima';
import { useBrowserTranslationWarning } from '~/common/components/useIsBrowserTranslating';
import { useVoiceCapability } from '~/common/components/useCapabilities';
import { useTTSCapability } from '~/modules/tts/tts.client.hooks';
import { useChatOverlayStore } from '~/common/chat-overlay/store-perchat_vanilla';
import { useScrollToBottom } from '~/common/scroll-to-bottom/useScrollToBottom';

Expand All @@ -30,6 +30,7 @@ import { PersonaSelector } from './persona-selector/PersonaSelector';
import { useChatAutoSuggestHTMLUI, useChatShowSystemMessages } from '../store-app-chat';



const stableNoMessages: DMessage[] = [];

/**
Expand Down Expand Up @@ -75,7 +76,7 @@ export function ChatMessageList(props: {
_composerInReferenceToCount: state.inReferenceTo?.length ?? 0,
ephemerals: state.ephemerals?.length ? state.ephemerals : null,
})));
const { mayWork: isSpeakable } = useVoiceCapability();
const { mayWork: isSpeakable } = useTTSCapability();

// derived state
const { conversationHandler, conversationId, capabilityHasT2I, onConversationBranch, onConversationExecuteHistory, onTextDiagram, onTextImagine, onTextSpeak } = props;
Expand Down
2 changes: 1 addition & 1 deletion src/apps/chat/editors/persona/PersonaChatMessageSpeak.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { speakText } from '~/modules/elevenlabs/elevenlabs.client';
import { speakText } from '~/modules/tts/tts.client';

import { isTextContentFragment } from '~/common/stores/chat/chat.fragments';

Expand Down
12 changes: 1 addition & 11 deletions src/apps/chat/store-app-chat.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import { create } from 'zustand';
import { persist } from 'zustand/middleware';
import { useShallow } from 'zustand/react/shallow';
import { ASREngineKey, ASREngineList, TTSEngineKey, TTSEngineList } from '~/common/components/useVoiceCapabilities';

import type { DLLMId } from '~/common/stores/llms/llms.types';
import { ASREngineKey, ASREngineList } from '~/modules/asr/asr.client';


export type ChatAutoSpeakType = 'off' | 'firstLine' | 'all';
Expand Down Expand Up @@ -52,9 +52,6 @@ interface AppChatStore {
micTimeoutMs: number;
setMicTimeoutMs: (micTimeoutMs: number) => void;

TTSEngine: TTSEngineKey;
setTTSEngine: (TTSEngine: TTSEngineKey) => void;

ASREngine: ASREngineKey;
setASREngine: (ASREngine: ASREngineKey) => void;

Expand Down Expand Up @@ -121,9 +118,6 @@ const useAppChatStore = create<AppChatStore>()(persist(
micTimeoutMs: 2000,
setMicTimeoutMs: (micTimeoutMs: number) => _set({ micTimeoutMs }),

TTSEngine: TTSEngineList[0].key,
setTTSEngine: (TTSEngine: TTSEngineKey) => _set({ TTSEngine }),

ASREngine: ASREngineList[0].key,
setASREngine: (ASREngine: ASREngineKey) => _set({ ASREngine }),

Expand Down Expand Up @@ -211,10 +205,6 @@ export const useChatMicTimeoutMsValue = (): number =>
/**
 * Hook returning the `[micTimeoutMs, setMicTimeoutMs]` pair read from the app chat store.
 * The selector result is wrapped in `useShallow`.
 */
export const useChatMicTimeoutMs = (): [number, (micTimeoutMs: number) => void] => {
  const selectMicTimeout = useShallow(
    ({ micTimeoutMs, setMicTimeoutMs }: AppChatStore): [number, (micTimeoutMs: number) => void] =>
      [micTimeoutMs, setMicTimeoutMs],
  );
  return useAppChatStore(selectMicTimeout);
};

export const useTTSEngine = (): [TTSEngineKey, (TTSEngine: TTSEngineKey) => void] =>
useAppChatStore(useShallow(state => [state.TTSEngine, state.setTTSEngine]));
export const getTTSEngine = () => useAppChatStore.getState().TTSEngine;

/**
 * Hook returning the `[ASREngine, setASREngine]` pair read from the app chat store.
 * The selector result is wrapped in `useShallow`.
 */
export const useASREngine = (): [ASREngineKey, (ASREngine: ASREngineKey) => void] => {
  const selectASREngine = useShallow(
    ({ ASREngine, setASREngine }: AppChatStore): [ASREngineKey, (ASREngine: ASREngineKey) => void] =>
      [ASREngine, setASREngine],
  );
  return useAppChatStore(selectASREngine);
};

Expand Down
16 changes: 6 additions & 10 deletions src/apps/settings-modal/SettingsModal.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ import WarningRoundedIcon from '@mui/icons-material/WarningRounded';

import { BrowseSettings } from '~/modules/browse/BrowseSettings';
import { DallESettings } from '~/modules/t2i/dalle/DallESettings';
import { ElevenlabsSettings } from '~/modules/elevenlabs/ElevenlabsSettings';
import { GoogleSearchSettings } from '~/modules/google/GoogleSearchSettings';
import { ProdiaSettings } from '~/modules/t2i/prodia/ProdiaSettings';
import { T2ISettings } from '~/modules/t2i/T2ISettings';
Expand All @@ -22,9 +21,9 @@ import { AppChatSettingsAI } from './AppChatSettingsAI';
import { AppChatSettingsUI } from './settings-ui/AppChatSettingsUI';
import { UxLabsSettings } from './UxLabsSettings';
import { VoiceSettings } from './VoiceSettings';
import { BrowserSpeechSettings } from '~/modules/browser/speech-synthesis/BrowserSpeechSettings';

import { useTTSEngine } from 'src/apps/chat/store-app-chat';
import { useTTSEngine } from '~/modules/tts/useTTSStore';
import { TTSSetting } from '~/modules/tts/tts.setting';
import { getName as getTTSEngineName } from '~/modules/tts/tts.client';


// styled <AccordionGroup variant='plain'> into a Topics component
Expand Down Expand Up @@ -198,12 +197,9 @@ export function SettingsModal(props: {
<Topic icon='🎙️' title='Voice settings'>
<VoiceSettings />
</Topic>
{TTSEngine === 'elevenlabs' && <Topic icon='📢' title='ElevenLabs API'>
<ElevenlabsSettings />
</Topic>}
{TTSEngine === 'webspeech' && <Topic icon='📢' title='Web Speech API'>
<BrowserSpeechSettings />
</Topic>}
<Topic icon='📢' title={getTTSEngineName()}>
<TTSSetting />
</Topic>
</Topics>
</TabPanel>

Expand Down
74 changes: 55 additions & 19 deletions src/apps/settings-modal/VoiceSettings.tsx
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import * as React from 'react';

import { FormControl } from '@mui/joy';
import { FormControl, Option, Select } from '@mui/joy';
import KeyboardArrowDownIcon from '@mui/icons-material/KeyboardArrowDown';

import { useASREngine, useChatAutoAI, useChatMicTimeoutMs, useTTSEngine } from '../chat/store-app-chat';
import { useASREngine, useChatAutoAI, useChatMicTimeoutMs } from '../chat/store-app-chat';

import { FormLabelStart } from '~/common/components/forms/FormLabelStart';
import { FormRadioControl } from '~/common/components/forms/FormRadioControl';
import { LanguageSelect } from '~/common/components/LanguageSelect';
import { useIsMobile } from '~/common/components/useMatchMedia';
import { hasVoices, ASREngineList, TTSEngineList, TTSEngineKey } from '~/common/components/useVoiceCapabilities';
import { ASREngineKey, ASREngineList } from '~/modules/asr/asr.client';
import { TTSEngineKey, TTSEngineList, useTTSEngine } from '~/modules/tts/useTTSStore';
import { useTTSCapability } from '~/modules/tts/tts.client.hooks';

export function VoiceSettings() {
// external state
Expand All @@ -23,6 +26,18 @@ export function VoiceSettings() {
const chatTimeoutValue: string = '' + chatTimeoutMs;
// Convert the selected option (a string) back to milliseconds and store it.
// Parsing is pinned to base 10, and empty or non-numeric input is ignored so
// that NaN is never written as the timeout.
const setChatTimeoutValue = (value: string) => {
  const parsedTimeoutMs = parseInt(value, 10);
  if (!Number.isNaN(parsedTimeoutMs))
    setChatTimeoutMs(parsedTimeoutMs);
};

const { mayWork: hasVoices } = useTTSCapability();

// Select change handler for the TTS engine dropdown.
// The event argument is unused, so it is typed `unknown` rather than `any`.
// A null value is ignored, leaving the current engine unchanged.
const handleTTSChanged = (_event: unknown, newValue: TTSEngineKey | null) => {
  if (newValue === null) return;
  setTTSEngine(newValue);
};

// Select change handler for the ASR engine dropdown.
// The event argument is unused, so it is typed `unknown` rather than `any`.
// A null value is ignored, leaving the current engine unchanged.
const handleASRChanged = (_event: unknown, newValue: ASREngineKey | null) => {
  if (newValue === null) return;
  setASREngine(newValue);
};

return (
<>
{/* LanguageSelect: moved here from the UI settings (where it logically belongs), just to group things better from a UX perspective */}
Expand Down Expand Up @@ -63,23 +78,44 @@ export function VoiceSettings() {
onChange={setAutoSpeak}
/>

<FormRadioControl
title="TTS engine"
description="Text to speech"
tooltip=""
options={TTSEngineList.map((i) => ({ value: i.key, label: i.label }))}
value={TTSEngine}
onChange={setTTSEngine}
/>
<FormControl orientation="horizontal" sx={{ justifyContent: 'space-between', alignItems: 'center' }}>
<FormLabelStart title="TTS engine" description="Text to speech / voice synthesis" tooltip="" />

<FormRadioControl
title="ASR engine"
description="Automatic Speech Recognition"
tooltip=""
options={ASREngineList.map((i) => ({ value: i.key, label: i.label }))}
value={ASREngine}
onChange={setASREngine}
/>
<Select
value={TTSEngine}
onChange={handleTTSChanged}
indicator={<KeyboardArrowDownIcon />}
slotProps={{
root: { sx: { minWidth: 200 } },
indicator: { sx: { opacity: 0.5 } },
}}
>
{TTSEngineList.map((i) => (
<Option key={i.key} value={i.key}>
{i.label}
</Option>
))}
</Select>
</FormControl>

<FormControl orientation="horizontal" sx={{ justifyContent: 'space-between', alignItems: 'center' }}>
<FormLabelStart title="ASR engine" description="Automatic Speech Recognition" tooltip="" />
<Select
value={ASREngine}
onChange={handleASRChanged}
indicator={<KeyboardArrowDownIcon />}
slotProps={{
root: { sx: { minWidth: 200 } },
indicator: { sx: { opacity: 0.5 } },
}}
>
{ASREngineList.map((i) => (
<Option key={i.key} value={i.key}>
{i.label}
</Option>
))}
</Select>
</FormControl>
</>
);
}
12 changes: 0 additions & 12 deletions src/common/components/useCapabilities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,6 @@ export interface CapabilityBrowserSpeechRecognition {

export { browserSpeechRecognitionCapability as useCapabilityBrowserSpeechRecognition } from './speechrecognition/useSpeechRecognition';


/// Speech Synthesis

export interface CapabilitySpeechSynthesis {
mayWork: boolean;
isConfiguredServerSide: boolean;
isConfiguredClientSide: boolean;
}

export { useCapability as useVoiceCapability } from '~/common/components/useVoiceCapabilities';


/// Image Generation

export interface TextToImageProvider {
Expand Down
Loading