BasedHardware · neooriginal · Oct 23, 2025 · Oct 23, 2025 · Oct 23, 2025 · Oct 23, 2025
diff --git a/backend/.env.template b/backend/.env.template
@@ -14,6 +14,7 @@ REDIS_DB_PASSWORD=
 
 SONIOX_API_KEY=
 DEEPGRAM_API_KEY=
+ELEVENLABS_API_KEY=
 
 ADMIN_KEY=
 OPENAI_API_KEY=

diff --git a/backend/database/conversations.py b/backend/database/conversations.py
@@ -767,12 +767,14 @@ def store_model_emotion_predictions_result(
 def get_conversation_transcripts_by_model(uid: str, conversation_id: str):
     user_ref = db.collection('users').document(uid)
     conversation_ref = user_ref.collection(conversations_collection).document(conversation_id)
+    elevenlabs_ref = conversation_ref.collection('elevenlabs_scribe')
     deepgram_ref = conversation_ref.collection('deepgram_streaming')
     soniox_ref = conversation_ref.collection('soniox_streaming')
     speechmatics_ref = conversation_ref.collection('speechmatics_streaming')
     whisperx_ref = conversation_ref.collection('fal_whisperx')
 
     return {
+        'elevenlabs': list(sorted([doc.to_dict() for doc in elevenlabs_ref.stream()], key=lambda x: x['start'])),
         'deepgram': list(sorted([doc.to_dict() for doc in deepgram_ref.stream()], key=lambda x: x['start'])),
         'soniox': list(sorted([doc.to_dict() for doc in soniox_ref.stream()], key=lambda x: x['start'])),
         'speechmatics': list(sorted([doc.to_dict() for doc in speechmatics_ref.stream()], key=lambda x: x['start'])),

diff --git a/backend/requirements.txt b/backend/requirements.txt
@@ -33,6 +33,8 @@ cycler==0.12.1
 dataclasses-json==0.6.7
 decorator==5.1.1
 deepgram-sdk==3.4.0
+elevenlabs==2.22.1
+websockets==14.1
 deprecation==2.1.0
 distro==1.9.0
 dnspython==2.6.1

diff --git a/backend/routers/transcribe.py b/backend/routers/transcribe.py
@@ -66,6 +66,7 @@
     process_audio_dg,
     process_audio_soniox,
     process_audio_speechmatics,
+    process_audio_elevenlabs,
     send_initial_file_path,
 )
 from utils.subscription import has_transcription_credits
@@ -606,6 +607,8 @@ def _update_in_progress_conversation(
         return
 
     # Process STT
+    elevenlabs_socket = None
+    elevenlabs_socket2 = None
     soniox_socket = None
     soniox_socket2 = None
     speechmatics_socket = None
@@ -622,6 +625,8 @@ def stream_transcript(segments):
 
     async def _process_stt():
         nonlocal websocket_close_code
+        nonlocal elevenlabs_socket
+        nonlocal elevenlabs_socket2
         nonlocal soniox_socket
         nonlocal soniox_socket2
         nonlocal speechmatics_socket
@@ -699,6 +704,21 @@ async def deepgram_socket_send(data):
 
                     safe_create_task(send_initial_file_path(file_path, soniox_socket.send))
                     print('speech_profile soniox duration', speech_profile_duration, uid, session_id)
+            # ELEVENLABS SCRIBE
+            elif stt_service == STTService.elevenlabs:
+                elevenlabs_socket = await process_audio_elevenlabs(
+                    stream_transcript, sample_rate, stt_language, preseconds=speech_profile_duration, model=stt_model
+                )
+                if speech_profile_duration and file_path:
+                    elevenlabs_socket2 = await process_audio_elevenlabs(
+                        stream_transcript, sample_rate, stt_language, preseconds=0, model=stt_model
+                    )
+
+                    async def elevenlabs_socket_send(data):
+                        return await elevenlabs_socket.send(data)
+
+                    safe_create_task(send_initial_file_path(file_path, elevenlabs_socket_send))
+                    print('speech_profile elevenlabs duration', speech_profile_duration, uid, session_id)
             # SPEECHMATICS
             elif stt_service == STTService.speechmatics:
                 speechmatics_socket = await process_audio_speechmatics(
@@ -1143,7 +1163,7 @@ async def handle_image_chunk(
     elif codec == 'lc3':
         lc3_decoder = lc3.Decoder(lc3_frame_duration_us, sample_rate)
 
-    async def receive_data(dg_socket1, dg_socket2, soniox_socket, soniox_socket2, speechmatics_socket1):
+    async def receive_data(dg_socket1, dg_socket2, elevenlabs_socket1, elevenlabs_socket2, soniox_socket, soniox_socket2, speechmatics_socket1):
         nonlocal websocket_active, websocket_close_code, last_audio_received_time, current_conversation_id
         nonlocal realtime_photo_buffers, speech_profile_processed, speaker_to_person_map, first_audio_byte_timestamp, last_usage_record_timestamp
 
@@ -1201,6 +1221,18 @@ async def receive_data(dg_socket1, dg_socket2, soniox_socket, soniox_socket2, sp
                             )
                             continue
 
+                    if elevenlabs_socket1 is not None:
+                        elapsed_seconds = time.time() - timer_start
+                        if elapsed_seconds > speech_profile_duration or not elevenlabs_socket2:
+                            await elevenlabs_socket1.send(data)
+                            if elevenlabs_socket2:
+                                print('Killing elevenlabs_socket2', uid, session_id)
+                                await elevenlabs_socket2.close()
+                                elevenlabs_socket2 = None
+                                speech_profile_processed = True
+                        else:
+                            await elevenlabs_socket2.send(data)
+
                     if soniox_socket is not None:
                         elapsed_seconds = time.time() - timer_start
                         if elapsed_seconds > speech_profile_duration or not soniox_socket2:
@@ -1303,7 +1335,7 @@ async def receive_data(dg_socket1, dg_socket2, soniox_socket, soniox_socket2, sp
 
         # Tasks
         data_process_task = asyncio.create_task(
-            receive_data(deepgram_socket, deepgram_socket2, soniox_socket, soniox_socket2, speechmatics_socket)
+            receive_data(deepgram_socket, deepgram_socket2, elevenlabs_socket, elevenlabs_socket2, soniox_socket, soniox_socket2, speechmatics_socket)
         )
         stream_transcript_task = asyncio.create_task(stream_transcript_process())
         record_usage_task = asyncio.create_task(_record_usage_periodically())
@@ -1336,6 +1368,10 @@ async def receive_data(dg_socket1, dg_socket2, soniox_socket, soniox_socket2, sp
                 deepgram_socket.finish()
             if deepgram_socket2:
                 deepgram_socket2.finish()
+            if elevenlabs_socket:
+                await elevenlabs_socket.close()
+            if elevenlabs_socket2:
+                await elevenlabs_socket2.close()
             if soniox_socket:
                 await soniox_socket.close()
             if soniox_socket2:

diff --git a/backend/scripts/stt/k_compare_transcripts_performance.py b/backend/scripts/stt/k_compare_transcripts_performance.py
@@ -28,7 +28,7 @@
 firebase_admin.initialize_app()
 
 from models.transcript_segment import TranscriptSegment
-from utils.stt.streaming import process_audio_dg, process_audio_soniox, process_audio_speechmatics
+from utils.stt.streaming import process_audio_dg, process_audio_soniox, process_audio_speechmatics, process_audio_elevenlabs
 from groq import Groq
 
 from utils.other.storage import upload_postprocessing_audio
@@ -86,7 +86,7 @@ async def _execute_single(file_path: str):
         return
 
     print('Started processing', memory_id, 'duration', aseg.duration_seconds)
-    result = {'deepgram': [], 'soniox': [], 'speechmatics': []}
+    result = {'elevenlabs': [], 'deepgram': [], 'soniox': [], 'speechmatics': []}
 
     def stream_transcript_deepgram(new_segments, _):
         print('stream_transcript_deepgram', new_segments)
@@ -99,11 +99,16 @@ def stream_transcript_soniox(new_segments, _):
     def stream_transcript_speechmatics(new_segments, _):
         print('stream_transcript_speechmatics', new_segments)
         add_model_result_segments('speechmatics', new_segments, result)
+
+    def stream_transcript_elevenlabs(new_segments, _):
+        print('stream_transcript_elevenlabs', new_segments)
+        add_model_result_segments('elevenlabs', new_segments, result)
 
     # streaming models
     socket = await process_audio_dg(stream_transcript_deepgram, '1', 'en', 16000, 'pcm16', 1, 0)
     socket_soniox = await process_audio_soniox(stream_transcript_soniox, '1', 16000, 'en', None)
     socket_speechmatics = await process_audio_speechmatics(stream_transcript_speechmatics, '1', 16000, 'en', 0)
+    socket_elevenlabs = await process_audio_elevenlabs(stream_transcript_elevenlabs, 16000, 'eng', 0)
     print('duration', duration)
     with open(file_path, "rb") as file:
         while True:
@@ -113,6 +118,7 @@ def stream_transcript_speechmatics(new_segments, _):
             socket.send(bytes(chunk))
             await socket_soniox.send(bytes(chunk))
             await socket_speechmatics.send(bytes(chunk))
+            await socket_elevenlabs.send(bytes(chunk))
             await asyncio.sleep(0.005)
 
     print('Finished sending audio')
@@ -139,6 +145,7 @@ def stream_transcript_speechmatics(new_segments, _):
     socket.finish()
     await socket_soniox.close()
     await socket_speechmatics.close()
+    await socket_elevenlabs.close()
 
 
 def batched(iterable, n):

diff --git a/backend/test_elevenlabs_stt.py b/backend/test_elevenlabs_stt.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+import os
+import asyncio
+from io import BytesIO
+from utils.stt.streaming import process_audio_elevenlabs, get_stt_service_for_language
+
+async def test_elevenlabs_stt():
+    """Smoke-test the ElevenLabs Scribe streaming STT integration.
+
+    Returns True when a socket was opened, fed dummy audio and closed without
+    error; False when ELEVENLABS_API_KEY is unset or any of those steps raised.
+    """
+    print("Testing ElevenLabs Scribe STT integration...")
+    # Check if API key is set
+    api_key = os.getenv('ELEVENLABS_API_KEY')
+    if not api_key:
+        print("ERROR: ELEVENLABS_API_KEY environment variable is not set")
+        return False
+
+    # Test service selection
+    service, language, model = get_stt_service_for_language('en')
+    print(f"Selected STT service: {service}, language: {language}, model: {model}")
+    if service != 'elevenlabs':
+        print("WARNING: ElevenLabs is not the primary STT service")
+        print("Make sure STT_SERVICE_MODELS environment variable includes 'el-scribe' first")
+
+    # Test creating ElevenLabs socket
+    segments_received = []
+
+    def stream_transcript(segments, *_):
+        # Extra positional arguments are ignored so either callback arity works.
+        print(f"Received segments: {segments}")
+        segments_received.extend(segments)
+
+    socket = None
+    succeeded = False
+    try:
+        socket = await process_audio_elevenlabs(
+            stream_transcript, sample_rate=16000, language='eng', preseconds=0, model='scribe_v2_realtime'
+        )
+        print("ElevenLabs socket created successfully")
+        await socket.send(b'\x00' * 3200)  # 3200 zero bytes of dummy audio
+        await asyncio.sleep(2)
+        succeeded = True
+    except Exception as e:
+        print(f"ERROR: Failed to test ElevenLabs STT: {e}")
+    if socket is not None:  # close on the failure path too, so the socket is not leaked
+        try:
+            await socket.close()
+            print("ElevenLabs socket closed successfully")
+        except Exception as e:
+            print(f"ERROR: Failed to test ElevenLabs STT: {e}")
+            succeeded = False
+    return succeeded
+
+if __name__ == "__main__":
+    # NOTE(review): utils.stt.streaming is already imported above; if it reads
+    # STT_SERVICE_MODELS at import time this default is applied too late -- confirm.
+    if not os.getenv('STT_SERVICE_MODELS'):
+        os.environ['STT_SERVICE_MODELS'] = 'el-scribe'
+    result = asyncio.run(test_elevenlabs_stt())
+    print("\n✅ Test PASSED" if result else "\n❌ Test FAILED")
+    # Exit non-zero on failure so shells and CI can detect it.
+    raise SystemExit(0 if result else 1)