diff --git a/packages/components/src/speechToText.ts b/packages/components/src/speechToText.ts index 821f0221630..547804c5c3f 100644 --- a/packages/components/src/speechToText.ts +++ b/packages/components/src/speechToText.ts @@ -3,11 +3,13 @@ import { getCredentialData } from './utils' import { type ClientOptions, OpenAIClient, toFile } from '@langchain/openai' import { AssemblyAI } from 'assemblyai' import { getFileFromStorage } from './storageUtils' +import Groq from 'groq-sdk' const SpeechToTextType = { OPENAI_WHISPER: 'openAIWhisper', ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe', - LOCALAI_STT: 'localAISTT' + LOCALAI_STT: 'localAISTT', + GROQ_WHISPER: 'groqWhisper' } export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfig: ICommonObject, options: ICommonObject) => { @@ -70,6 +72,23 @@ export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfi } break } + case SpeechToTextType.GROQ_WHISPER: { + const groqClient = new Groq({ + apiKey: credentialData.groqApiKey + }) + const file = await toFile(audio_file, upload.name) + const groqTranscription = await groqClient.audio.transcriptions.create({ + file, + model: speechToTextConfig?.model || 'whisper-large-v3', + language: speechToTextConfig?.language, + temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined, + response_format: 'verbose_json' + }) + if (groqTranscription?.text) { + return groqTranscription.text + } + break + } } } else { throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.') diff --git a/packages/ui/src/assets/images/groq.png b/packages/ui/src/assets/images/groq.png new file mode 100644 index 00000000000..ea2b8821cf8 Binary files /dev/null and b/packages/ui/src/assets/images/groq.png differ diff --git a/packages/ui/src/ui-component/extended/SpeechToText.jsx b/packages/ui/src/ui-component/extended/SpeechToText.jsx index 59f9964b918..17dc132039b 100644 --- a/packages/ui/src/ui-component/extended/SpeechToText.jsx +++ b/packages/ui/src/ui-component/extended/SpeechToText.jsx @@ -17,6 +17,7 @@ import { Dropdown } from '@/ui-component/dropdown/Dropdown' import openAISVG from '@/assets/images/openai.svg' import assemblyAIPng from '@/assets/images/assemblyai.png' import localAiPng from '@/assets/images/localai.png' +import groqPng from '@/assets/images/groq.png' // store import useNotifier from '@/utils/useNotifier' @@ -29,7 +30,8 @@ import chatflowsApi from '@/api/chatflows' const SpeechToTextType = { OPENAI_WHISPER: 'openAIWhisper', ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe', - LOCALAI_STT: 'localAISTT' + LOCALAI_STT: 'localAISTT', + GROQ_WHISPER: 'groqWhisper' } // Weird quirk - the key must match the name property value. @@ -139,6 +141,46 @@ const speechToTextProviders = { optional: true } ] + }, + [SpeechToTextType.GROQ_WHISPER]: { + label: 'Groq Whisper', + name: SpeechToTextType.GROQ_WHISPER, + icon: groqPng, + url: 'https://console.groq.com/', + inputs: [ + { + label: 'Model', + name: 'model', + type: 'string', + description: `The STT model to load. Defaults to whisper-large-v3 if left blank.`, + placeholder: 'whisper-large-v3', + optional: true + }, + { + label: 'Connect Credential', + name: 'credential', + type: 'credential', + credentialNames: ['groqApi'] + }, + { + label: 'Language', + name: 'language', + type: 'string', + description: + 'The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.', + placeholder: 'en', + optional: true + }, + { + label: 'Temperature', + name: 'temperature', + type: 'number', + step: 0.1, + description: + 'The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.', + optional: true + } + ] } } @@ -210,6 +252,9 @@ const SpeechToText = ({ dialogProps }) => { newVal[provider.name] = { ...speechToText[provider.name], status: false } } }) + if (providerName !== 'none') { + newVal['none'].status = false + } } setSpeechToText(newVal) return newVal