speech recognition error handling & browser fixes
cogentapps committed Mar 21, 2023
1 parent 9e10796 commit d7251e3
Showing 2 changed files with 96 additions and 98 deletions.
1 change: 1 addition & 0 deletions app/package.json
@@ -3,6 +3,7 @@
"version": "0.2.1",
"dependencies": {
"@auth0/auth0-spa-js": "^2.0.4",
"@chengsokdara/use-whisper": "^0.2.0",
"@emotion/css": "^11.10.6",
"@emotion/react": "^11.10.6",
"@emotion/styled": "^11.10.6",
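
The commit swaps the hand-rolled mic-recorder-to-mp3 / lamejs recording pipeline for the @chengsokdara/use-whisper hook added above. As a quick reference (not part of the commit), here is a minimal TypeScript sketch of how the hook is consumed further down in input.tsx; the options and return values are taken from that usage, and the VoiceNote component name is made up for illustration:

import { useWhisper } from '@chengsokdara/use-whisper';

// Minimal consumer mirroring the options used in input.tsx below.
function VoiceNote({ apiKey }: { apiKey: string }) {
    const { transcribing, transcript, startRecording, stopRecording } = useWhisper({
        apiKey,           // OpenAI API key forwarded to the Whisper transcription request
        streaming: false, // produce the transcript only after recording stops
    });

    return (
        <div>
            <button onClick={() => startRecording()}>Record</button>
            <button onClick={() => stopRecording()}>Stop</button>
            {transcribing ? <span>Transcribing…</span> : <span>{transcript?.text}</span>}
        </div>
    );
}
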
193 changes: 95 additions & 98 deletions app/src/components/input.tsx
@@ -1,7 +1,7 @@
import styled from '@emotion/styled';
import { Button, ActionIcon, Textarea, Loader } from '@mantine/core';
import { Button, ActionIcon, Textarea, Loader, Popover } from '@mantine/core';
import { useMediaQuery } from '@mantine/hooks';
import { useCallback, useMemo, useState } from 'react';
import { useCallback, useEffect, useMemo, useState } from 'react';
import { FormattedMessage, useIntl } from 'react-intl';
import { useLocation } from 'react-router-dom';
import { useAppContext } from '../context';
@@ -10,9 +10,8 @@ import { selectMessage, setMessage } from '../store/message';
import { selectTemperature } from '../store/parameters';
import { openOpenAIApiKeyPanel, openSystemPromptPanel, openTemperaturePanel } from '../store/settings-ui';
import { speechRecognition, supportsSpeechRecognition } from '../speech-recognition-types'
import MicRecorder from 'mic-recorder-to-mp3';
import { useWhisper } from '@chengsokdara/use-whisper';
import { selectUseOpenAIWhisper, selectOpenAIApiKey } from '../store/api-keys';
import { Mp3Encoder } from 'lamejs';

const Container = styled.div`
background: #292933;
@@ -38,55 +37,26 @@ export interface MessageInputProps {
disabled?: boolean;
}



async function chunkAndEncodeMP3File(file: Blob): Promise<Array<File>> {
const MAX_CHUNK_SIZE = 25 * 1024 * 1024; // 25 MB
const audioContext = new AudioContext();
const audioBuffer = await audioContext.decodeAudioData(await file.arrayBuffer());
const duration = audioBuffer.duration;
const sampleRate = audioBuffer.sampleRate;
const numChannels = audioBuffer.numberOfChannels;
const bytesPerSample = 2; // 16-bit audio
const samplesPerChunk = Math.floor((MAX_CHUNK_SIZE / bytesPerSample) / numChannels);
const totalSamples = Math.floor(duration * sampleRate);
const numChunks = Math.ceil(totalSamples / samplesPerChunk);

const chunks: Array<File> = [];
for (let i = 0; i < numChunks; i++) {
const startSample = i * samplesPerChunk;
const endSample = Math.min(startSample + samplesPerChunk, totalSamples);
const chunkDuration = (endSample - startSample) / sampleRate;
const chunkBuffer = audioContext.createBuffer(numChannels, endSample - startSample, sampleRate);
for (let c = 0; c < numChannels; c++) {
const channelData = audioBuffer.getChannelData(c).subarray(startSample, endSample);
chunkBuffer.copyToChannel(channelData, c);
}
const chunkBlob = await new Promise<Blob>((resolve) => {
const encoder = new Mp3Encoder(numChannels, sampleRate, 128);
const leftData = chunkBuffer.getChannelData(0);
const rightData = numChannels === 1 ? leftData : chunkBuffer.getChannelData(1);
const mp3Data = encoder.encodeBuffer(leftData, rightData);
const blob = new Blob([mp3Data], { type: 'audio/mp3' });
resolve(blob);
});
chunks.push(new File([chunkBlob], `text-${i}.mp3`, { type: 'audio/mp3' }));
}

return chunks;
}


export default function MessageInput(props: MessageInputProps) {
const temperature = useAppSelector(selectTemperature);
const message = useAppSelector(selectMessage);
const [recording, setRecording] = useState(false);
const [transcribing, setTranscribing] = useState(false);
const [speechError, setSpeechError] = useState<string | null>(null);
const hasVerticalSpace = useMediaQuery('(min-height: 1000px)');
const recorder = useMemo(() => new MicRecorder({ bitRate: 128 }), []);
const useOpenAIWhisper = useAppSelector(selectUseOpenAIWhisper);
const openAIApiKey = useAppSelector(selectOpenAIApiKey);

const [initialMessage, setInitialMessage] = useState('');
const {
transcribing,
transcript,
startRecording,
stopRecording,
} = useWhisper({
apiKey: openAIApiKey || ' ',
streaming: false,
});

const context = useAppContext();
const dispatch = useAppDispatch();
const intl = useIntl();
@@ -100,40 +70,71 @@ export default function MessageInput(props: MessageInputProps) {
const pathname = useLocation().pathname;

const onSubmit = useCallback(async () => {
setSpeechError(null);

if (await context.onNewMessage(message)) {
dispatch(setMessage(''));
}
}, [context, message, dispatch]);

const onSpeechError = useCallback((e: any) => {
console.error('speech recognition error', e);
setSpeechError(e.message);

try {
speechRecognition?.stop();
} catch (e) {
}

try {
recorder.stop();
stopRecording();
} catch (e) { }

setRecording(false);
setTranscribing(false);
}, [recorder]);
}, [stopRecording]);

const onSpeechStart = useCallback(() => {
if (!openAIApiKey) {
dispatch(openOpenAIApiKeyPanel());
return false;
const onHideSpeechError = useCallback(() => setSpeechError(null), []);

const onSpeechStart = useCallback(async () => {
let granted = false;
let denied = false;

try {
const result = await navigator.permissions.query({ name: 'microphone' as any });
if (result.state == 'granted') {
granted = true;
} else if (result.state == 'denied') {
denied = true;
}
} catch (e) {}

if (!granted && !denied) {
try {
const stream = await navigator.mediaDevices.getUserMedia({ video: false, audio: true });
stream.getTracks().forEach(track => track.stop());
granted = true;
} catch (e) {
denied = true;
}
}

if (denied) {
onSpeechError(new Error('speech permission was not granted'));
return;
}

try {
if (!recording) {
setRecording(true);

// if we are using whisper, then we will just record with the browser and send to the API when done
if (useOpenAIWhisper || !supportsSpeechRecognition) {
recorder.start().catch(onSpeechError);
if (!openAIApiKey) {
dispatch(openOpenAIApiKeyPanel());
return false;
}
// recorder.start().catch(onSpeechError);
setInitialMessage(message);
await startRecording();
} else if (speechRecognition) {
const initialMessage = message;

@@ -155,54 +156,28 @@
onSpeechError(new Error('not supported'));
}
} else {
setRecording(false);
if (useOpenAIWhisper || !supportsSpeechRecognition) {
setTranscribing(true);
const mp3 = recorder.stop().getMp3();

mp3.then(async ([buffer, blob]) => {
const file = new File(buffer, 'chat.mp3', {
type: blob.type,
lastModified: Date.now()
});

// TODO: cut in chunks

var data = new FormData()
data.append('file', file);
data.append('model', 'whisper-1')

try {
const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
method: "POST",
headers: {
'Authorization': `Bearer ${openAIApiKey}`,
},
body: data,
});

const json = await response.json()

if (json.text) {
dispatch(setMessage(message + ' ' + json.text));
setTranscribing(false);
}
} catch (e) {
onSpeechError(e);
}

}).catch(onSpeechError);
await stopRecording();
setTimeout(() => setRecording(false), 500);
} else if (speechRecognition) {
speechRecognition.stop();
setRecording(false);
} else {
onSpeechError(new Error('not supported'));
}
}
} catch (e) {
onSpeechError(e);
}
}, [recording, message, dispatch, onSpeechError, openAIApiKey]);
}, [recording, message, dispatch, onSpeechError, setInitialMessage, openAIApiKey]);

useEffect(() => {
if (useOpenAIWhisper || !supportsSpeechRecognition) {
if (!transcribing && !recording && transcript?.text) {
dispatch(setMessage(initialMessage + ' ' + transcript.text));
}
}
}, [initialMessage, transcript, recording, transcribing, useOpenAIWhisper, dispatch]);

const onKeyDown = useCallback((e: React.KeyboardEvent<HTMLTextAreaElement>) => {
if (e.key === 'Enter' && e.shiftKey === false && !props.disabled) {
@@ -212,7 +187,6 @@ }, [onSubmit, props.disabled]);
}, [onSubmit, props.disabled]);

const rightSection = useMemo(() => {

return (
<div style={{
opacity: '0.8',
@@ -232,11 +206,34 @@ export default function MessageInput(props: MessageInputProps) {
</>)}
{!context.generating && (
<>
<ActionIcon size="xl"
onClick={onSpeechStart}>
{transcribing && <Loader size="xs" />}
{!transcribing && <i className="fa fa-microphone" style={{ fontSize: '90%', color: recording ? 'red' : 'inherit' }} />}
</ActionIcon>
<Popover width={200} position="bottom" withArrow shadow="md" opened={speechError !== null}>
<Popover.Target>
<ActionIcon size="xl"
onClick={onSpeechStart}>
{transcribing && <Loader size="xs" />}
{!transcribing && <i className="fa fa-microphone" style={{ fontSize: '90%', color: recording ? 'red' : 'inherit' }} />}
</ActionIcon>
</Popover.Target>
<Popover.Dropdown>
<div style={{
display: 'flex',
flexDirection: 'column',
alignItems: 'flex-start',
}}>
<p style={{
fontFamily: `"Work Sans", sans-serif`,
fontSize: '0.9rem',
textAlign: 'center',
marginBottom: '0.5rem',
}}>
Sorry, an error occurred trying to record audio.
</p>
<Button variant="light" size="xs" fullWidth onClick={onHideSpeechError}>
Close
</Button>
</div>
</Popover.Dropdown>
</Popover>
<ActionIcon size="xl"
onClick={onSubmit}>
<i className="fa fa-paper-plane" style={{ fontSize: '90%' }} />
@@ -245,7 +242,7 @@
)}
</div>
);
}, [recording, transcribing, onSubmit, onSpeechStart, props.disabled, context.generating]);
}, [recording, transcribing, onSubmit, onSpeechStart, props.disabled, context.generating, speechError, onHideSpeechError]);

const disabled = context.generating;

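
For readers skimming the diff, the new permission check at the top of onSpeechStart can be read as a standalone helper: try the Permissions API first, then fall back to a short getUserMedia probe. A sketch of the same logic (the isMicrophoneAllowed name is hypothetical and not part of the commit):

// Standalone sketch of the permission probe added to onSpeechStart above.
async function isMicrophoneAllowed(): Promise<boolean> {
    // Prefer the Permissions API where the browser supports the 'microphone'
    // descriptor (not all browsers do, hence the cast and the try/catch).
    try {
        const result = await navigator.permissions.query({ name: 'microphone' as any });
        if (result.state === 'granted') return true;
        if (result.state === 'denied') return false;
    } catch (e) {
        // fall through to the getUserMedia probe
    }

    // Fall back to requesting an audio stream; stop the tracks immediately
    // so the probe does not keep the microphone open.
    try {
        const stream = await navigator.mediaDevices.getUserMedia({ video: false, audio: true });
        stream.getTracks().forEach(track => track.stop());
        return true;
    } catch (e) {
        return false;
    }
}
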
