Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backend/.env.template
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ REDIS_DB_PASSWORD=

SONIOX_API_KEY=
DEEPGRAM_API_KEY=
ELEVENLABS_API_KEY=

ADMIN_KEY=
OPENAI_API_KEY=
Expand Down
2 changes: 2 additions & 0 deletions backend/database/conversations.py
Original file line number Diff line number Diff line change
Expand Up @@ -767,12 +767,14 @@ def store_model_emotion_predictions_result(
def get_conversation_transcripts_by_model(uid: str, conversation_id: str):
user_ref = db.collection('users').document(uid)
conversation_ref = user_ref.collection(conversations_collection).document(conversation_id)
elevenlabs_ref = conversation_ref.collection('elevenlabs_scribe')
deepgram_ref = conversation_ref.collection('deepgram_streaming')
soniox_ref = conversation_ref.collection('soniox_streaming')
speechmatics_ref = conversation_ref.collection('speechmatics_streaming')
whisperx_ref = conversation_ref.collection('fal_whisperx')

return {
'elevenlabs': list(sorted([doc.to_dict() for doc in elevenlabs_ref.stream()], key=lambda x: x['start'])),
'deepgram': list(sorted([doc.to_dict() for doc in deepgram_ref.stream()], key=lambda x: x['start'])),
'soniox': list(sorted([doc.to_dict() for doc in soniox_ref.stream()], key=lambda x: x['start'])),
'speechmatics': list(sorted([doc.to_dict() for doc in speechmatics_ref.stream()], key=lambda x: x['start'])),
Expand Down
2 changes: 2 additions & 0 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ cycler==0.12.1
dataclasses-json==0.6.7
decorator==5.1.1
deepgram-sdk==3.4.0
elevenlabs==2.22.1
websockets==14.1
deprecation==2.1.0
distro==1.9.0
dnspython==2.6.1
Expand Down
40 changes: 38 additions & 2 deletions backend/routers/transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
process_audio_dg,
process_audio_soniox,
process_audio_speechmatics,
process_audio_elevenlabs,
send_initial_file_path,
)
from utils.subscription import has_transcription_credits
Expand Down Expand Up @@ -606,6 +607,8 @@ def _update_in_progress_conversation(
return

# Process STT
elevenlabs_socket = None
elevenlabs_socket2 = None
soniox_socket = None
soniox_socket2 = None
speechmatics_socket = None
Expand All @@ -622,6 +625,8 @@ def stream_transcript(segments):

async def _process_stt():
nonlocal websocket_close_code
nonlocal elevenlabs_socket
nonlocal elevenlabs_socket2
nonlocal soniox_socket
nonlocal soniox_socket2
nonlocal speechmatics_socket
Expand Down Expand Up @@ -699,6 +704,21 @@ async def deepgram_socket_send(data):

safe_create_task(send_initial_file_path(file_path, soniox_socket.send))
print('speech_profile soniox duration', speech_profile_duration, uid, session_id)
# ELEVENLABS SCRIBE
elif stt_service == STTService.elevenlabs:
elevenlabs_socket = await process_audio_elevenlabs(
stream_transcript, sample_rate, stt_language, preseconds=speech_profile_duration, model=stt_model
)
if speech_profile_duration and file_path:
elevenlabs_socket2 = await process_audio_elevenlabs(
stream_transcript, sample_rate, stt_language, preseconds=0, model=stt_model
)

async def elevenlabs_socket_send(data):
return await elevenlabs_socket.send(data)

safe_create_task(send_initial_file_path(file_path, elevenlabs_socket_send))
print('speech_profile elevenlabs duration', speech_profile_duration, uid, session_id)
# SPEECHMATICS
elif stt_service == STTService.speechmatics:
speechmatics_socket = await process_audio_speechmatics(
Expand Down Expand Up @@ -1143,7 +1163,7 @@ async def handle_image_chunk(
elif codec == 'lc3':
lc3_decoder = lc3.Decoder(lc3_frame_duration_us, sample_rate)

async def receive_data(dg_socket1, dg_socket2, soniox_socket, soniox_socket2, speechmatics_socket1):
async def receive_data(dg_socket1, dg_socket2, elevenlabs_socket1, elevenlabs_socket2, soniox_socket, soniox_socket2, speechmatics_socket1):
nonlocal websocket_active, websocket_close_code, last_audio_received_time, current_conversation_id
nonlocal realtime_photo_buffers, speech_profile_processed, speaker_to_person_map, first_audio_byte_timestamp, last_usage_record_timestamp

Expand Down Expand Up @@ -1201,6 +1221,18 @@ async def receive_data(dg_socket1, dg_socket2, soniox_socket, soniox_socket2, sp
)
continue

if elevenlabs_socket1 is not None:
elapsed_seconds = time.time() - timer_start
if elapsed_seconds > speech_profile_duration or not elevenlabs_socket2:
await elevenlabs_socket1.send(data)
if elevenlabs_socket2:
print('Killing elevenlabs_socket2', uid, session_id)
await elevenlabs_socket2.close()
elevenlabs_socket2 = None
speech_profile_processed = True
else:
await elevenlabs_socket2.send(data)

if soniox_socket is not None:
elapsed_seconds = time.time() - timer_start
if elapsed_seconds > speech_profile_duration or not soniox_socket2:
Expand Down Expand Up @@ -1303,7 +1335,7 @@ async def receive_data(dg_socket1, dg_socket2, soniox_socket, soniox_socket2, sp

# Tasks
data_process_task = asyncio.create_task(
receive_data(deepgram_socket, deepgram_socket2, soniox_socket, soniox_socket2, speechmatics_socket)
receive_data(deepgram_socket, deepgram_socket2, elevenlabs_socket, elevenlabs_socket2, soniox_socket, soniox_socket2, speechmatics_socket)
)
stream_transcript_task = asyncio.create_task(stream_transcript_process())
record_usage_task = asyncio.create_task(_record_usage_periodically())
Expand Down Expand Up @@ -1336,6 +1368,10 @@ async def receive_data(dg_socket1, dg_socket2, soniox_socket, soniox_socket2, sp
deepgram_socket.finish()
if deepgram_socket2:
deepgram_socket2.finish()
if elevenlabs_socket:
await elevenlabs_socket.close()
if elevenlabs_socket2:
await elevenlabs_socket2.close()
if soniox_socket:
await soniox_socket.close()
if soniox_socket2:
Expand Down
11 changes: 9 additions & 2 deletions backend/scripts/stt/k_compare_transcripts_performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
firebase_admin.initialize_app()

from models.transcript_segment import TranscriptSegment
from utils.stt.streaming import process_audio_dg, process_audio_soniox, process_audio_speechmatics
from utils.stt.streaming import process_audio_dg, process_audio_soniox, process_audio_speechmatics, process_audio_elevenlabs
from groq import Groq

from utils.other.storage import upload_postprocessing_audio
Expand Down Expand Up @@ -86,7 +86,7 @@ async def _execute_single(file_path: str):
return

print('Started processing', memory_id, 'duration', aseg.duration_seconds)
result = {'deepgram': [], 'soniox': [], 'speechmatics': []}
result = {'elevenlabs': [], 'deepgram': [], 'soniox': [], 'speechmatics': []}

def stream_transcript_deepgram(new_segments, _):
print('stream_transcript_deepgram', new_segments)
Expand All @@ -99,11 +99,16 @@ def stream_transcript_soniox(new_segments, _):
def stream_transcript_speechmatics(new_segments, _):
print('stream_transcript_speechmatics', new_segments)
add_model_result_segments('speechmatics', new_segments, result)

def stream_transcript_elevenlabs(new_segments, _):
print('stream_transcript_elevenlabs', new_segments)
add_model_result_segments('elevenlabs', new_segments, result)

# streaming models
socket = await process_audio_dg(stream_transcript_deepgram, '1', 'en', 16000, 'pcm16', 1, 0)
socket_soniox = await process_audio_soniox(stream_transcript_soniox, '1', 16000, 'en', None)
socket_speechmatics = await process_audio_speechmatics(stream_transcript_speechmatics, '1', 16000, 'en', 0)
socket_elevenlabs = await process_audio_elevenlabs(stream_transcript_elevenlabs, 16000, 'eng', 0)
print('duration', duration)
with open(file_path, "rb") as file:
while True:
Expand All @@ -113,6 +118,7 @@ def stream_transcript_speechmatics(new_segments, _):
socket.send(bytes(chunk))
await socket_soniox.send(bytes(chunk))
await socket_speechmatics.send(bytes(chunk))
await socket_elevenlabs.send(bytes(chunk))
await asyncio.sleep(0.005)

print('Finished sending audio')
Expand All @@ -139,6 +145,7 @@ def stream_transcript_speechmatics(new_segments, _):
socket.finish()
await socket_soniox.close()
await socket_speechmatics.close()
await socket_elevenlabs.close()


def batched(iterable, n):
Expand Down
64 changes: 64 additions & 0 deletions backend/test_elevenlabs_stt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env python3
import os
import asyncio
from io import BytesIO
from utils.stt.streaming import process_audio_elevenlabs, get_stt_service_for_language

async def test_elevenlabs_stt():
    """Smoke-test the ElevenLabs Scribe streaming STT integration.

    Checks that ``ELEVENLABS_API_KEY`` is set, reports which STT service is
    selected for English, then opens a streaming socket, sends one chunk of
    dummy audio, waits briefly for transcripts and closes the socket.

    Returns:
        bool: ``True`` when the socket was created, accepted the audio chunk
        and closed without raising; ``False`` when the API key is missing or
        any step raised an exception.
    """
    print("Testing ElevenLabs Scribe STT integration...")

    # Check if API key is set
    api_key = os.getenv('ELEVENLABS_API_KEY')
    if not api_key:
        print("ERROR: ELEVENLABS_API_KEY environment variable is not set")
        return False

    # Test service selection
    service, language, model = get_stt_service_for_language('en')
    print(f"Selected STT service: {service}, language: {language}, model: {model}")

    # Only a warning: the socket test below runs regardless of the selection.
    if service != 'elevenlabs':
        print("WARNING: ElevenLabs is not the primary STT service")
        print("Make sure STT_SERVICE_MODELS environment variable includes 'el-scribe' first")

    # Test creating ElevenLabs socket
    segments_received = []

    def stream_transcript(segments, *_extra):
        # Extra positional arguments are accepted and ignored so the callback
        # works whether it is invoked with one or two arguments.
        # NOTE(review): confirm the callback arity process_audio_elevenlabs uses.
        print(f"Received segments: {segments}")
        segments_received.extend(segments)

    try:
        socket = await process_audio_elevenlabs(
            stream_transcript,
            sample_rate=16000,
            language='eng',
            preseconds=0,
            model='scribe_v2_realtime',
        )

        print("ElevenLabs socket created successfully")

        try:
            # 3200 zero bytes — presumably 100 ms of 16 kHz 16-bit mono PCM
            # silence; TODO confirm the encoding the socket expects.
            dummy_audio = b'\x00' * 3200
            await socket.send(dummy_audio)

            # Give the service a moment to deliver any transcript callbacks.
            await asyncio.sleep(2)
        finally:
            # Always release the connection, even when sending failed.
            await socket.close()

        print("ElevenLabs socket closed successfully")
        return True

    except Exception as e:
        # Top-level boundary of a smoke test: report the failure and signal
        # it through the return value instead of crashing.
        print(f"ERROR: Failed to test ElevenLabs STT: {e}")
        return False

if __name__ == "__main__":
    import sys

    # Default to the ElevenLabs Scribe model when the caller has not chosen one.
    # NOTE(review): utils.stt.streaming is imported at the top of this file,
    # before this assignment runs; if that module reads STT_SERVICE_MODELS at
    # import time, this default is applied too late — confirm.
    if not os.getenv('STT_SERVICE_MODELS'):
        os.environ['STT_SERVICE_MODELS'] = 'el-scribe'

    result = asyncio.run(test_elevenlabs_stt())
    if result:
        print("\n✅ Test PASSED")
    else:
        print("\n❌ Test FAILED")

    # Propagate the outcome through the process exit status so shells and CI
    # can detect a failed run.
    sys.exit(0 if result else 1)
Loading