diff --git a/app/package.json b/app/package.json
index 2b690c25..2b4850fb 100644
--- a/app/package.json
+++ b/app/package.json
@@ -4,6 +4,7 @@
   "dependencies": {
     "@auth0/auth0-spa-js": "^2.0.4",
     "@emotion/css": "^11.10.6",
+    "@emotion/react": "^11.10.6",
    "@emotion/styled": "^11.10.6",
     "@mantine/core": "^5.10.5",
     "@mantine/hooks": "^5.10.5",
@@ -20,6 +21,7 @@
     "jshashes": "^1.0.8",
     "localforage": "^1.10.0",
     "match-sorter": "^6.3.1",
+    "mic-recorder-to-mp3": "^2.2.2",
     "minisearch": "^6.0.1",
     "natural": "^6.2.0",
     "openai": "^3.2.1",
diff --git a/app/src/components/input.tsx b/app/src/components/input.tsx
index 18da4a10..2e440b6d 100644
--- a/app/src/components/input.tsx
+++ b/app/src/components/input.tsx
@@ -1,7 +1,7 @@
 import styled from '@emotion/styled';
 import { Button, ActionIcon, Textarea, Loader } from '@mantine/core';
 import { useMediaQuery } from '@mantine/hooks';
-import { useCallback, useMemo } from 'react';
+import { useCallback, useMemo, useState } from 'react';
 import { FormattedMessage, useIntl } from 'react-intl';
 import { useLocation } from 'react-router-dom';
 import { useAppContext } from '../context';
@@ -9,6 +9,10 @@ import { useAppDispatch, useAppSelector } from '../store';
 import { selectMessage, setMessage } from '../store/message';
 import { selectTemperature } from '../store/parameters';
 import { openSystemPromptPanel, openTemperaturePanel } from '../store/settings-ui';
+import { speechRecognition } from '../speech-recognition-types.d';
+import MicRecorder from 'mic-recorder-to-mp3';
+import { selectUseOpenAIWhisper, selectOpenAIApiKey } from '../store/api-keys';
+import { Mp3Encoder } from 'lamejs';
 
 const Container = styled.div`
     background: #292933;
@@ -34,12 +38,54 @@ export interface MessageInputProps {
     disabled?: boolean;
 }
 
+
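+// The OpenAI transcription endpoint rejects uploads larger than 25 MB, so long
+// recordings must be split and re-encoded into smaller standalone MP3 files.
+// (Mp3Encoder comes from lamejs, which mic-recorder-to-mp3 already depends on.)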
+async function chunkAndEncodeMP3File(file: Blob): Promise<Array<File>> {
+    const MAX_CHUNK_SIZE = 25 * 1024 * 1024; // 25 MB
+    const audioContext = new AudioContext();
+    const audioBuffer = await audioContext.decodeAudioData(await file.arrayBuffer());
+    const duration = audioBuffer.duration;
+    const sampleRate = audioBuffer.sampleRate;
+    const numChannels = audioBuffer.numberOfChannels;
+    const bytesPerSample = 2; // 16-bit audio
+    const samplesPerChunk = Math.floor((MAX_CHUNK_SIZE / bytesPerSample) / numChannels);
+    const totalSamples = Math.floor(duration * sampleRate);
+    const numChunks = Math.ceil(totalSamples / samplesPerChunk);
+
+    const chunks: Array<File> = [];
+    for (let i = 0; i < numChunks; i++) {
+        const startSample = i * samplesPerChunk;
+        const endSample = Math.min(startSample + samplesPerChunk, totalSamples);
+        const chunkBuffer = audioContext.createBuffer(numChannels, endSample - startSample, sampleRate);
+        for (let c = 0; c < numChannels; c++) {
+            const channelData = audioBuffer.getChannelData(c).subarray(startSample, endSample);
+            chunkBuffer.copyToChannel(channelData, c);
+        }
+        const chunkBlob = await new Promise<Blob>((resolve) => {
+            const encoder = new Mp3Encoder(numChannels, sampleRate, 128);
+            // lamejs expects 16-bit integer samples, so convert the Web Audio floats
+            const toInt16 = (data: Float32Array) => Int16Array.from(data, (s) => Math.max(-1, Math.min(1, s)) * 0x7fff);
+            const leftData = toInt16(chunkBuffer.getChannelData(0));
+            const rightData = numChannels === 1 ? leftData : toInt16(chunkBuffer.getChannelData(1));
+            const mp3Data = encoder.encodeBuffer(leftData, rightData);
+            // flush() drains the final MP3 frames still buffered in the encoder
+            const blob = new Blob([mp3Data, encoder.flush()], { type: 'audio/mp3' });
+            resolve(blob);
+        });
+        chunks.push(new File([chunkBlob], `text-${i}.mp3`, { type: 'audio/mp3' }));
+    }
+
+    return chunks;
+}
+
 
 export default function MessageInput(props: MessageInputProps) {
     const temperature = useAppSelector(selectTemperature);
     const message = useAppSelector(selectMessage);
-
+    const [recording, setRecording] = useState(false);
     const hasVerticalSpace = useMediaQuery('(min-height: 1000px)');
-
+    const recorder = useMemo(() => new MicRecorder({ bitRate: 128 }), []);
+    const useOpenAIWhisper = useAppSelector(selectUseOpenAIWhisper);
+    const openAIApiKey = useAppSelector(selectOpenAIApiKey);
+
     const context = useAppContext();
     const dispatch = useAppDispatch();
     const intl = useIntl();
@@ -58,6 +104,69 @@ export default function MessageInput(props: MessageInputProps) {
         }
     }, [context, message, dispatch]);
 
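+    // Toggles recording: the first press starts capturing audio, the second
+    // stops and converts speech to text, either through the OpenAI Whisper API
+    // or through the browser's built-in speech recognition.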
+    const onSpeechStart = useCallback(() => {
+        if (!recording) {
+            setRecording(true);
+
+            // if we are using Whisper, we just record with the browser and
+            // send the audio to the API once recording stops
+            if (useOpenAIWhisper) {
+                recorder.start().catch((e: any) => console.error(e));
+            } else {
+                speechRecognition.continuous = true;
+                speechRecognition.interimResults = true;
+
+                speechRecognition.onresult = (event) => {
+                    const transcript = event.results[event.results.length - 1][0].transcript;
+                    dispatch(setMessage(transcript));
+                };
+
+                speechRecognition.start();
+            }
+        } else {
+            setRecording(false);
+            if (useOpenAIWhisper) {
+                const mp3 = recorder.stop().getMp3();
+
+                mp3.then(async ([buffer, blob]) => {
+                    const file = new File(buffer, 'chat.mp3', {
+                        type: blob.type,
+                        lastModified: Date.now(),
+                    });
+
+                    // TODO: cut in chunks
+
+                    const data = new FormData();
+                    data.append('file', file);
+                    data.append('model', 'whisper-1');
+
+                    try {
+                        const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
+                            method: "POST",
+                            headers: {
+                                'Authorization': `Bearer ${openAIApiKey}`,
+                            },
+                            body: data,
+                        });
+
+                        const json = await response.json();
+
+                        if (json.text) {
+                            dispatch(setMessage(json.text));
+                        }
+                    } catch (e) {
+                        console.error(e);
+                    }
+                }).catch((e: any) => console.error(e));
+            } else {
+                speechRecognition.stop();
+            }
+        }
+    }, [recording, useOpenAIWhisper, openAIApiKey, recorder, dispatch]);
+
     const onKeyDown = useCallback((e: React.KeyboardEvent<HTMLTextAreaElement>) => {
         if (e.key === 'Enter' && e.shiftKey === false && !props.disabled) {
             e.preventDefault();
@@ -66,6 +175,7 @@ export default function MessageInput(props: MessageInputProps) {
     }, [onSubmit, props.disabled]);
 
     const rightSection = useMemo(() => {
+
         return (
@@ ... @@
                 )}
                 {!context.generating && (
-                    <Button variant="subtle" size="xs" compact onClick={() => onSubmit()}>
-                        <FormattedMessage defaultMessage={"Send"} />
-                    </Button>
+                    <>
+                        <ActionIcon size="xl" onClick={onSpeechStart}>
+                            <i className="fa fa-microphone" style={{ fontSize: 90, color: recording ? 'red' : 'inherit' }} />
+                        </ActionIcon>
+                        <Button variant="subtle" size="xs" compact onClick={() => onSubmit()}>
+                            <FormattedMessage defaultMessage={"Send"} />
+                        </Button>
+                    </>
                 )}
             </div>
         );
-    }, [onSubmit, props.disabled, context.generating]);
+    }, [recording, onSpeechStart, onSubmit, props.disabled, context.generating]);
 
     const disabled = context.generating;
diff --git a/app/src/components/message.tsx b/app/src/components/message.tsx
index de77ae1a..de14c6f0 100644
--- a/app/src/components/message.tsx
+++ b/app/src/components/message.tsx
@@ -244,7 +244,7 @@ export default function MessageComponent(props: { message: Message, last: boolean
-                <...>
+                <...>
                 )}
@@ -263,7 +263,7 @@ export default function MessageComponent(props: { message: Message, last: boolean
             }}>
-                {editing ? <...> : <...>}
+                {editing ? <...> : <...>}
diff --git a/app/src/components/settings/user.tsx b/app/src/components/settings/user.tsx
index 9e217fa2..6b6d1303 100644
--- a/app/src/components/settings/user.tsx
+++ b/app/src/components/settings/user.tsx
@@ -1,24 +1,26 @@
 import SettingsTab from "./tab";
 import SettingsOption from "./option";
-import { TextInput } from "@mantine/core";
+import { Checkbox, TextInput } from "@mantine/core";
 import { useCallback, useMemo } from "react";
 import { useAppDispatch, useAppSelector } from "../../store";
-import { selectOpenAIApiKey, setOpenAIApiKeyFromEvent } from "../../store/api-keys";
+import { selectOpenAIApiKey, setOpenAIApiKeyFromEvent, selectUseOpenAIWhisper, setUseOpenAIWhisperFromEvent } from "../../store/api-keys";
 import { selectSettingsOption } from "../../store/settings-ui";
 import { FormattedMessage, useIntl } from "react-intl";
 
 export default function UserOptionsTab(props: any) {
     const option = useAppSelector(selectSettingsOption);
     const openaiApiKey = useAppSelector(selectOpenAIApiKey);
+    const useOpenAIWhisper = useAppSelector(selectUseOpenAIWhisper);
     const intl = useIntl()
 
     const dispatch = useAppDispatch();
 
     const onOpenAIApiKeyChange = useCallback((event: React.ChangeEvent<HTMLInputElement>) => dispatch(setOpenAIApiKeyFromEvent(event)), [dispatch]);
+    const onUseOpenAIWhisperChange = useCallback((event: React.ChangeEvent<HTMLInputElement>) => dispatch(setUseOpenAIWhisperFromEvent(event)), [dispatch]);
 
     const elem = useMemo(() => (
@@ ... @@
+                focused={option === 'openai-api-key'}>
@@ ... @@
+                <Checkbox style={{ marginTop: '1rem' }}
+                    label={intl.formatMessage({ defaultMessage: "Use the OpenAI Whisper API for speech recognition" })}
+                    checked={useOpenAIWhisper} onChange={onUseOpenAIWhisperChange} />
@@ -36,7 +45,7 @@ export default function UserOptionsTab(props: any) {
 
         </SettingsTab>
-    ), [option, openaiApiKey, onOpenAIApiKeyChange]);
+    ), [option, openaiApiKey, useOpenAIWhisper, onOpenAIApiKeyChange, onUseOpenAIWhisperChange]);
 
     return elem;
 }
\ No newline at end of file
diff --git a/app/src/index.tsx b/app/src/index.tsx
index 877a99f9..209c75b6 100644
--- a/app/src/index.tsx
+++ b/app/src/index.tsx
@@ -72,7 +72,7 @@ async function bootstrapApplication() {
     root.render(
-        <...>
+        <...>
diff --git a/app/src/openai.ts b/app/src/openai.ts
index 07e6afcd..0e9eb9c7 100644
--- a/app/src/openai.ts
+++ b/app/src/openai.ts
@@ -26,7 +26,7 @@ export interface OpenAIResponseChunk {
 function parseResponseChunk(buffer: any): OpenAIResponseChunk {
     const chunk = buffer.toString().replace('data: ', '').trim();
-    
+
     if (chunk === '[DONE]') {
         return {
             done: true,
@@ -51,7 +51,7 @@ export async function createChatCompletion(messages: OpenAIMessage[], parameters
     const configuration = new Configuration({
         apiKey: parameters.apiKey,
     });
-    
+
     const openai = new OpenAIApi(configuration);
 
     const response = await openai.createChatCompletion({
@@ -131,6 +131,7 @@ export async function createStreamingChatCompletion(messages: OpenAIMessage[], p
     });
 
     eventSource.addEventListener('message', async (event: any) => {
+
         if (event.data === '[DONE]') {
             emitter.emit('done');
             return;
@@ -149,7 +150,7 @@ export async function createStreamingChatCompletion(messages: OpenAIMessage[], p
 
     eventSource.stream();
 
-    return { 
+    return {
         emitter,
         cancel: () => eventSource.close(),
     };
diff --git a/app/src/speech-recognition-types.d.ts b/app/src/speech-recognition-types.d.ts
new file mode 100644
index 00000000..8e205a5b
--- /dev/null
+++ b/app/src/speech-recognition-types.d.ts
@@ -0,0 +1,133 @@
+declare global {
+  interface Window {
+    SpeechRecognition: SpeechRecognition
+  }
+
+  interface SpeechGrammar {
+    src: string
+    weight: number
+  }
+
+  const SpeechGrammar: {
+    prototype: SpeechGrammar
+    new(): SpeechGrammar
+  }
+
+  interface SpeechGrammarList {
+    readonly length: number
+    addFromString(string: string, weight?: number): void
+    addFromURI(src: string, weight?: number): void
+    item(index: number): SpeechGrammar
+    [index: number]: SpeechGrammar
+  }
+
+  const SpeechGrammarList: {
+    prototype: SpeechGrammarList
+    new(): SpeechGrammarList
+  }
+
+  interface SpeechRecognitionEventMap {
+    audioend: Event
+    audiostart: Event
+    end: Event
+    error: SpeechRecognitionError
+    nomatch: SpeechRecognitionEvent
+    result: SpeechRecognitionEvent
+    soundend: Event
+    soundstart: Event
+    speechend: Event
+    speechstart: Event
+    start: Event
+  }
+
+  interface SpeechRecognition {
+    continuous: boolean
+    grammars: SpeechGrammarList
+    interimResults: boolean
+    lang: string
+    maxAlternatives: number
+    onaudioend: ((this: SpeechRecognition, ev: Event) => any) | null
+    onaudiostart: ((this: SpeechRecognition, ev: Event) => any) | null
+    onend: ((this: SpeechRecognition, ev: Event) => any) | null
+    onerror: ((this: SpeechRecognition, ev: SpeechRecognitionError) => any) | null
+    onnomatch: ((this: SpeechRecognition, ev: SpeechRecognitionEvent) => any) | null
+    onresult: ((this: SpeechRecognition, ev: SpeechRecognitionEvent) => any) | null
+    onsoundend: ((this: SpeechRecognition, ev: Event) => any) | null
+    onsoundstart: ((this: SpeechRecognition, ev: Event) => any) | null
+    onspeechend: ((this: SpeechRecognition, ev: Event) => any) | null
+    onspeechstart: ((this: SpeechRecognition, ev: Event) => any) | null
+    onstart: ((this: SpeechRecognition, ev: Event) => any) | null
+    serviceURI: string
+    abort(): void
+    start(): void
+    stop(): void
+    addEventListener<K extends keyof SpeechRecognitionEventMap>(
+      type: K,
+      listener: (this: SpeechRecognition, ev: SpeechRecognitionEventMap[K]) => any,
+      options?: boolean | AddEventListenerOptions
+    ): void
+    addEventListener(
+      type: string,
+      listener: EventListenerOrEventListenerObject,
+      options?: boolean | AddEventListenerOptions
+    ): void
+    removeEventListener<K extends keyof SpeechRecognitionEventMap>(
+      type: K,
+      listener: (this: SpeechRecognition, ev: SpeechRecognitionEventMap[K]) => any,
+      options?: boolean | EventListenerOptions
+    ): void
+    removeEventListener(
+      type: string,
+      listener: EventListenerOrEventListenerObject,
+      options?: boolean | EventListenerOptions
+    ): void
+  }
+
+  const SpeechRecognition: {
+    prototype: SpeechRecognition
+    new(): SpeechRecognition
+  }
+
+  // declared as well so the webkit-prefixed fallback below type-checks
+  const webkitSpeechRecognition: {
+    prototype: SpeechRecognition
+    new(): SpeechRecognition
+  }
+
+  interface SpeechRecognitionError extends Event {
+    // readonly error: SpeechRecognitionErrorCode;
+    readonly message: string
+  }
+
+  const SpeechRecognitionError: {
+    prototype: SpeechRecognitionError
+    new(): SpeechRecognitionError
+  }
+
+  interface SpeechRecognitionEvent extends Event {
+    readonly emma: Document | null
+    readonly interpretation: any
+    readonly resultIndex: number
+    readonly results: SpeechRecognitionResultList
+  }
+
+  const SpeechRecognitionEvent: {
+    prototype: SpeechRecognitionEvent
+    new(): SpeechRecognitionEvent
+  }
+}
+
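+// Use the standard constructor where the browser provides it; Chrome and
+// Safari still expose the Web Speech API only behind the webkit prefix.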
+let speechRecognition: SpeechRecognition
+
+if (window.SpeechRecognition) {
+  speechRecognition = new SpeechRecognition()
+} else {
+  speechRecognition = new webkitSpeechRecognition()
+}
+
+export { speechRecognition }
\ No newline at end of file
diff --git a/app/src/store/api-keys.ts b/app/src/store/api-keys.ts
index 59a86344..eced56c4 100644
--- a/app/src/store/api-keys.ts
+++ b/app/src/store/api-keys.ts
@@ -3,9 +3,12 @@ import type { RootState } from '.';
 
 const initialState: {
     openAIApiKey?: string | null | undefined;
+    useOpenAIWhisper: boolean;
     elevenLabsApiKey?: string | null | undefined;
+
 } = {
     openAIApiKey: localStorage.getItem('openai-api-key'),
+    useOpenAIWhisper: false,
     elevenLabsApiKey: localStorage.getItem('elevenlabs-api-key'),
 };
 
@@ -18,7 +21,11 @@ export const apiKeysSlice = createSlice({
         },
         setElevenLabsApiKey: (state, action: PayloadAction<string>) => {
             state.elevenLabsApiKey = action.payload;
+        },
+        setUseOpenAIWhisper: (state, action: PayloadAction<boolean>) => {
+            state.useOpenAIWhisper = action.payload;
         }
+
     },
 })
 
@@ -26,8 +33,10 @@ export const { setOpenAIApiKey, setElevenLabsApiKey } = apiKeysSlice.actions;
 
 export const setOpenAIApiKeyFromEvent = (event: React.ChangeEvent<HTMLInputElement>) => apiKeysSlice.actions.setOpenAIApiKey(event.target.value);
 export const setElevenLabsApiKeyFromEvent = (event: React.ChangeEvent<HTMLInputElement>) => apiKeysSlice.actions.setElevenLabsApiKey(event.target.value);
+export const setUseOpenAIWhisperFromEvent = (event: React.ChangeEvent<HTMLInputElement>) => apiKeysSlice.actions.setUseOpenAIWhisper(event.target.checked);
 
 export const selectOpenAIApiKey = (state: RootState) => state.apiKeys.openAIApiKey;
 export const selectElevenLabsApiKey = (state: RootState) => state.apiKeys.elevenLabsApiKey;
+export const selectUseOpenAIWhisper = (state: RootState) => state.apiKeys.useOpenAIWhisper;
 
 export default apiKeysSlice.reducer;
\ No newline at end of file
diff --git a/app/src/store/index.ts b/app/src/store/index.ts
index fa5be686..be1fdde8 100644
--- a/app/src/store/index.ts
+++ b/app/src/store/index.ts
@@ -25,6 +25,8 @@ const persistMessageConfig = {
     storage,
 }
 
+
+
 const store = configureStore({
     reducer: {
         // auth: authReducer,
diff --git a/server/src/endpoints/whisper.ts b/server/src/endpoints/whisper.ts
new file mode 100644
index 00000000..94a81671
--- /dev/null
+++ b/server/src/endpoints/whisper.ts
@@ -0,0 +1,8 @@
+import express from 'express';
+import RequestHandler from "./base";
+
+export default class WhisperRequestHandler extends RequestHandler {
+    handler(req: express.Request, res: express.Response): any {
+        // stub: always reports success; transcription still happens client-side
+        res.json({ status: 'ok' });
+    }
+}
\ No newline at end of file
diff --git a/server/src/index.ts b/server/src/index.ts
index d8328499..1720daf8 100644
--- a/server/src/index.ts
+++ b/server/src/index.ts
@@ -18,6 +18,7 @@ import BasicCompletionRequestHandler from './endpoints/completion/basic';
 import StreamingCompletionRequestHandler from './endpoints/completion/streaming';
 import SessionRequestHandler from './endpoints/session';
 import GetShareRequestHandler from './endpoints/get-share';
+import WhisperRequestHandler from './endpoints/whisper';
 import { configurePassport } from './passport';
 import { configureAuth0 } from './auth0';
 import DeleteChatRequestHandler from './endpoints/delete-chat';
@@ -82,6 +83,7 @@ export default class ChatServer {
         this.app.post('/chatapi/sync', (req, res) => new SyncRequestHandler(this, req, res));
         this.app.get('/chatapi/share/:id', (req, res) => new GetShareRequestHandler(this, req, res));
         this.app.post('/chatapi/share', (req, res) => new ShareRequestHandler(this, req, res));
+        this.app.post('/chatapi/whisper', (req, res) => new WhisperRequestHandler(this, req, res));
 
         if (process.env.ENABLE_SERVER_COMPLETION) {
             this.app.post('/chatapi/completion', (req, res) => new BasicCompletionRequestHandler(this, req, res));
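Note: the new `/chatapi/whisper` route is currently a stub, and the client still calls OpenAI directly with the user's key. If the route is later meant to proxy transcription server-side (keeping the key off the client), a minimal sketch could look like the following. This is an assumption, not part of this diff: `multer`, `form-data`, and `node-fetch` are not current server dependencies, and the `RequestHandler` base class is assumed to tolerate async work inside `handler`.

```ts
import express from 'express';
import multer from 'multer';
import fetch from 'node-fetch';
import FormData from 'form-data';
import RequestHandler from './base';

// keep uploads in memory; Whisper accepts files up to 25 MB
const upload = multer({ storage: multer.memoryStorage(), limits: { fileSize: 25 * 1024 * 1024 } });

export default class WhisperRequestHandler extends RequestHandler {
    handler(req: express.Request, res: express.Response): any {
        // parse the multipart body, then forward the audio to OpenAI so the
        // API key never leaves the server
        upload.single('file')(req, res, async (err: any) => {
            if (err || !req.file) {
                return res.status(400).json({ error: 'missing or oversized audio file' });
            }

            const form = new FormData();
            form.append('file', req.file.buffer, { filename: 'chat.mp3', contentType: 'audio/mp3' });
            form.append('model', 'whisper-1');

            const response = await fetch('https://api.openai.com/v1/audio/transcriptions', {
                method: 'POST',
                headers: { 'Authorization': `Bearer ${process.env.OPENAI_API_KEY}` },
                body: form,
            });

            // relay OpenAI's JSON ({ text: "..." } on success) and status code
            res.status(response.status).json(await response.json());
        });
    }
}
```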