Skip to content

Commit

Permalink
Merge pull request cogentapps#58 from tluyben/whisper-stt-api
Browse files Browse the repository at this point in the history
Whisper STT API
  • Loading branch information
cogentapps authored Mar 20, 2023
2 parents b6881c6 + 39e175b commit 95bf0ae
Show file tree
Hide file tree
Showing 11 changed files with 300 additions and 18 deletions.
2 changes: 2 additions & 0 deletions app/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"dependencies": {
"@auth0/auth0-spa-js": "^2.0.4",
"@emotion/css": "^11.10.6",
"@emotion/react": "^11.10.6",
"@emotion/styled": "^11.10.6",
"@mantine/core": "^5.10.5",
"@mantine/hooks": "^5.10.5",
Expand All @@ -20,6 +21,7 @@
"jshashes": "^1.0.8",
"localforage": "^1.10.0",
"match-sorter": "^6.3.1",
"mic-recorder-to-mp3": "^2.2.2",
"minisearch": "^6.0.1",
"natural": "^6.2.0",
"openai": "^3.2.1",
Expand Down
132 changes: 124 additions & 8 deletions app/src/components/input.tsx
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
import styled from '@emotion/styled';
import { Button, ActionIcon, Textarea, Loader } from '@mantine/core';
import { useMediaQuery } from '@mantine/hooks';
import { useCallback, useMemo } from 'react';
import { useCallback, useMemo, useState } from 'react';
import { FormattedMessage, useIntl } from 'react-intl';
import { useLocation } from 'react-router-dom';
import { useAppContext } from '../context';
import { useAppDispatch, useAppSelector } from '../store';
import { selectMessage, setMessage } from '../store/message';
import { selectTemperature } from '../store/parameters';
import { openSystemPromptPanel, openTemperaturePanel } from '../store/settings-ui';
import { speechRecognition } from '../speech-recognition-types.d'
import MicRecorder from 'mic-recorder-to-mp3';
import { selectUseOpenAIWhisper, selectOpenAIApiKey } from '../store/api-keys';
import { Mp3Encoder } from 'lamejs';

const Container = styled.div`
background: #292933;
Expand All @@ -34,12 +38,54 @@ export interface MessageInputProps {
disabled?: boolean;
}



async function chunkAndEncodeMP3File(file: Blob): Promise<Array<File>> {
const MAX_CHUNK_SIZE = 25 * 1024 * 1024; // 25 MB
const audioContext = new AudioContext();
const audioBuffer = await audioContext.decodeAudioData(await file.arrayBuffer());
const duration = audioBuffer.duration;
const sampleRate = audioBuffer.sampleRate;
const numChannels = audioBuffer.numberOfChannels;
const bytesPerSample = 2; // 16-bit audio
const samplesPerChunk = Math.floor((MAX_CHUNK_SIZE / bytesPerSample) / numChannels);
const totalSamples = Math.floor(duration * sampleRate);
const numChunks = Math.ceil(totalSamples / samplesPerChunk);

const chunks: Array<File> = [];
for (let i = 0; i < numChunks; i++) {
const startSample = i * samplesPerChunk;
const endSample = Math.min(startSample + samplesPerChunk, totalSamples);
const chunkDuration = (endSample - startSample) / sampleRate;
const chunkBuffer = audioContext.createBuffer(numChannels, endSample - startSample, sampleRate);
for (let c = 0; c < numChannels; c++) {
const channelData = audioBuffer.getChannelData(c).subarray(startSample, endSample);
chunkBuffer.copyToChannel(channelData, c);
}
const chunkBlob = await new Promise<Blob>((resolve) => {
const encoder = new Mp3Encoder(numChannels, sampleRate, 128);
const leftData = chunkBuffer.getChannelData(0);
const rightData = numChannels === 1 ? leftData : chunkBuffer.getChannelData(1);
const mp3Data = encoder.encodeBuffer(leftData, rightData);
const blob = new Blob([mp3Data], { type: 'audio/mp3' });
resolve(blob);
});
chunks.push(new File([chunkBlob], `text-${i}.mp3`, { type: 'audio/mp3' }));
}

return chunks;
}


export default function MessageInput(props: MessageInputProps) {
const temperature = useAppSelector(selectTemperature);
const message = useAppSelector(selectMessage);

const [recording, setRecording] = useState(false);
const hasVerticalSpace = useMediaQuery('(min-height: 1000px)');

const recorder = useMemo(() => new MicRecorder({ bitRate: 128 }), []);
const useOpenAIWhisper = useAppSelector(selectUseOpenAIWhisper);
const openAIApiKey = useAppSelector(selectOpenAIApiKey);

const context = useAppContext();
const dispatch = useAppDispatch();
const intl = useIntl();
Expand All @@ -58,6 +104,69 @@ export default function MessageInput(props: MessageInputProps) {
}
}, [context, message, dispatch]);

// Toggle voice input. First click starts recording; second click stops it
// and feeds the transcript into the message input, either via the OpenAI
// Whisper API (browser-side MP3 recording) or the native SpeechRecognition API.
const onSpeechStart = useCallback(() => {
    if (!recording) {
        setRecording(true);

        // If we are using Whisper, then we just record with the browser and
        // send the audio to the API when done.
        if (useOpenAIWhisper) {
            recorder.start().catch((e: any) => console.error(e));
        } else {
            speechRecognition.continuous = true;
            speechRecognition.interimResults = true;

            // Stream interim results straight into the message box so the
            // user sees the transcript as they speak.
            speechRecognition.onresult = (event) => {
                const transcript = event.results[event.results.length - 1][0].transcript;
                dispatch(setMessage(transcript));
            };

            speechRecognition.start();
        }
    } else {
        setRecording(false);
        if (useOpenAIWhisper) {
            const mp3 = recorder.stop().getMp3();

            mp3.then(async ([buffer, blob]) => {
                const file = new File(buffer, 'chat.mp3', {
                    type: blob.type,
                    lastModified: Date.now(),
                });

                // TODO: cut in chunks

                const data = new FormData();
                data.append('file', file);
                data.append('model', 'whisper-1');

                try {
                    const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
                        method: "POST",
                        headers: {
                            'Authorization': `Bearer ${openAIApiKey}`,
                        },
                        body: data,
                    });

                    if (!response.ok) {
                        console.error(`transcription request failed: ${response.status}`);
                        return;
                    }

                    const json = await response.json();

                    if (json.text) {
                        dispatch(setMessage(json.text));
                    }
                } catch (e) {
                    console.error(e);
                }
            }).catch((e: any) => console.error(e));
        } else {
            speechRecognition.stop();
        }
    }
    // Include every closed-over value: omitting useOpenAIWhisper/recorder/
    // openAIApiKey made the callback capture stale settings after a toggle;
    // `message` was listed but never read, so it is dropped.
}, [recording, useOpenAIWhisper, recorder, openAIApiKey, dispatch]);


const onKeyDown = useCallback((e: React.KeyboardEvent<HTMLTextAreaElement>) => {
if (e.key === 'Enter' && e.shiftKey === false && !props.disabled) {
e.preventDefault();
Expand All @@ -66,6 +175,7 @@ export default function MessageInput(props: MessageInputProps) {
}, [onSubmit, props.disabled]);

const rightSection = useMemo(() => {

return (
<div style={{
opacity: '0.8',
Expand All @@ -84,14 +194,20 @@ export default function MessageInput(props: MessageInputProps) {
<Loader size="xs" style={{ padding: '0 0.8rem 0 0.5rem' }} />
</>)}
{!context.generating && (
<ActionIcon size="xl"
onClick={onSubmit}>
<i className="fa fa-paper-plane" style={{ fontSize: '90%' }} />
</ActionIcon>
<>
<ActionIcon size="xl"
onClick={onSpeechStart}>
<i className="fa fa-microphone" style={{ fontSize: '90%', color: recording ? 'red' : 'inherit' }} />
</ActionIcon>
<ActionIcon size="xl"
onClick={onSubmit}>
<i className="fa fa-paper-plane" style={{ fontSize: '90%' }} />
</ActionIcon>
</>
)}
</div>
);
}, [onSubmit, props.disabled, context.generating]);
}, [recording, onSubmit, props.disabled, context.generating]);

const disabled = context.generating;

Expand Down
4 changes: 2 additions & 2 deletions app/src/components/message.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ export default function MessageComponent(props: { message: Message, last: boolea
<Button variant="subtle" size="sm" compact onClick={copy} style={{ marginLeft: '1rem' }}>
<i className="fa fa-clipboard" />
{copied ? <FormattedMessage defaultMessage="Copied" description="Label for copy-to-clipboard button after a successful copy" />
: <FormattedMessage defaultMessage="Copy" description="Label for copy-to-clipboard button" />}
: <FormattedMessage defaultMessage="Copy" description="Label for copy-to-clipboard button" />}
</Button>
)}
</CopyButton>
Expand All @@ -263,7 +263,7 @@ export default function MessageComponent(props: { message: Message, last: boolea
}}>
<i className="fa fa-edit" />
<span>
{editing ? <FormattedMessage defaultMessage="Cancel" description="Label for a button that appears when the user is editing the text of one of their messages, to cancel without saving changes" />
{editing ? <FormattedMessage defaultMessage="Cancel" description="Label for a button that appears when the user is editing the text of one of their messages, to cancel without saving changes" />
: <FormattedMessage defaultMessage="Edit" description="Label for the button the user can click to edit the text of one of their messages" />}
</span>
</Button>
Expand Down
17 changes: 13 additions & 4 deletions app/src/components/settings/user.tsx
Original file line number Diff line number Diff line change
@@ -1,24 +1,26 @@
import SettingsTab from "./tab";
import SettingsOption from "./option";
import { TextInput } from "@mantine/core";
import { Checkbox, TextInput } from "@mantine/core";
import { useCallback, useMemo } from "react";
import { useAppDispatch, useAppSelector } from "../../store";
import { selectOpenAIApiKey, setOpenAIApiKeyFromEvent } from "../../store/api-keys";
import { selectOpenAIApiKey, setOpenAIApiKeyFromEvent, selectUseOpenAIWhisper, setUseOpenAIWhisperFromEvent } from "../../store/api-keys";
import { selectSettingsOption } from "../../store/settings-ui";
import { FormattedMessage, useIntl } from "react-intl";

export default function UserOptionsTab(props: any) {
const option = useAppSelector(selectSettingsOption);
const openaiApiKey = useAppSelector(selectOpenAIApiKey);
const useOpenAIWhisper = useAppSelector(selectUseOpenAIWhisper);
const intl = useIntl()

const dispatch = useAppDispatch();
const onOpenAIApiKeyChange = useCallback((event: React.ChangeEvent<HTMLInputElement>) => dispatch(setOpenAIApiKeyFromEvent(event)), [dispatch]);
const onUseOpenAIWhisperChange = useCallback((event: React.ChangeEvent<HTMLInputElement>) => dispatch(setUseOpenAIWhisperFromEvent(event)), [dispatch]);

const elem = useMemo(() => (
<SettingsTab name="user">
<SettingsOption heading={intl.formatMessage({ defaultMessage: "Your OpenAI API Key", description: "Heading for the OpenAI API key setting on the settings screen" })}
focused={option === 'openai-api-key'}>
focused={option === 'openai-api-key'}>
<TextInput
placeholder={intl.formatMessage({ defaultMessage: "Paste your API key here" })}
value={openaiApiKey || ''}
Expand All @@ -28,6 +30,13 @@ export default function UserOptionsTab(props: any) {
<FormattedMessage defaultMessage="Find your API key here." description="Label for the link that takes the user to the page on the OpenAI website where they can find their API key." />
</a>
</p>

<Checkbox
style={{ marginTop: '1rem' }}
id="use-openai-whisper-api" checked={useOpenAIWhisper!} onChange={onUseOpenAIWhisperChange}
label="Use the OpenAI Whisper API for speech recognition."
/>

<p>
<FormattedMessage defaultMessage="Your API key is stored only on this device and never transmitted to anyone except OpenAI." />
</p>
Expand All @@ -36,7 +45,7 @@ export default function UserOptionsTab(props: any) {
</p>
</SettingsOption>
</SettingsTab>
), [option, openaiApiKey, onOpenAIApiKeyChange]);
), [option, openaiApiKey, useOpenAIWhisper, onOpenAIApiKeyChange]);

return elem;
}
2 changes: 1 addition & 1 deletion app/src/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ async function bootstrapApplication() {

root.render(
<React.StrictMode>
<IntlProvider locale={navigator.language} messages={messages}>
<IntlProvider locale={navigator.language} defaultLocale="en-GB" messages={messages}>
<MantineProvider theme={{ colorScheme: "dark" }}>
<Provider store={store}>
<PersistGate loading={null} persistor={persistor}>
Expand Down
7 changes: 4 additions & 3 deletions app/src/openai.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ export interface OpenAIResponseChunk {

function parseResponseChunk(buffer: any): OpenAIResponseChunk {
const chunk = buffer.toString().replace('data: ', '').trim();

if (chunk === '[DONE]') {
return {
done: true,
Expand All @@ -51,7 +51,7 @@ export async function createChatCompletion(messages: OpenAIMessage[], parameters
const configuration = new Configuration({
apiKey: parameters.apiKey,
});

const openai = new OpenAIApi(configuration);

const response = await openai.createChatCompletion({
Expand Down Expand Up @@ -131,6 +131,7 @@ export async function createStreamingChatCompletion(messages: OpenAIMessage[], p
});

eventSource.addEventListener('message', async (event: any) => {

if (event.data === '[DONE]') {
emitter.emit('done');
return;
Expand All @@ -149,7 +150,7 @@ export async function createStreamingChatCompletion(messages: OpenAIMessage[], p

eventSource.stream();

return {
return {
emitter,
cancel: () => eventSource.close(),
};
Expand Down
Loading

0 comments on commit 95bf0ae

Please sign in to comment.