-
Notifications
You must be signed in to change notification settings - Fork 163
Description
Describe the bug
Hey, So I am trying to make a voice agent that can be deployed on a website.
The Website frontend is made using React with python-flask as backend
The problem is the agent shows an inconsistent behavior where it sometimes joins the room and sometimes does not. Because of this inconsistency I am unable to identify what the problem is.
Relevant log output
No response
Describe your environment
I am running this whole code in production environment, with the backend on EC2, OS=Ubuntu and frontend on Vercel
Minimal reproducible example
agent.py
from dotenv import load_dotenv
import asyncio
import os
import logging
import traceback
from livekit import agents
from livekit.agents import AgentSession, Agent, RoomInputOptions, JobRequest
from livekit.plugins import (
openai,
cartesia,
deepgram,
silero,
elevenlabs
)
from livekit.agents.voice.agent import ModelSettings
from livekit.agents.llm import ChatMessage
import livekit.agents.llm as livekit_llm
# from livekit.plugins.turn_detector.multilingual import MultilingualModel
# from livekit.agents import ChatContext
load_dotenv()
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
class Assistant(Agent):
    """Voice agent that augments every LLM turn with RAG context retrieved
    from a vector index, while keeping only a short rolling history.
    """

    def __init__(self, session: AgentSession, index):
        # Load the system prompt that lives next to this file (voice_agent/).
        base_dir = os.path.dirname(os.path.abspath(__file__))
        prompt_path = os.path.join(base_dir, "prompt_template.txt")
        with open(prompt_path, "r", encoding="utf-8") as f:
            instructions = f.read()
        super().__init__(instructions=instructions)
        self._instructions = instructions
        self.index = index            # vector store exposing similarity_search(query, k=...)
        self._session = session
        self.interaction_count = 0    # number of LLM turns handled so far

    async def llm_node(
        self,
        chat_ctx: livekit_llm.ChatContext,
        tools: list[livekit_llm.FunctionTool],
        model_settings: ModelSettings,
    ):
        """Intercept the LLM call: inject retrieved documents into a fresh
        system message, append the last 3 conversation messages, then delegate
        to the default LLM node and stream its chunks through.
        """
        self.interaction_count += 1

        # The latest user utterance drives the similarity search.
        user_query = ""
        if (
            chat_ctx.items
            and isinstance(chat_ctx.items[-1], ChatMessage)
            and chat_ctx.items[-1].role == "user"
        ):
            user_query = chat_ctx.items[-1].text_content or ""

        context = self._instructions + "\n\n"
        if user_query.strip():
            docs = self.index.similarity_search(user_query, k=5)
            context += "".join(doc.page_content + "\n" for doc in docs)

        chat_ctx_to_use = livekit_llm.ChatContext()
        chat_ctx_to_use.items.append(ChatMessage(role="system", content=[context]))

        # BUG FIX: chat_ctx.items can contain non-message items (function calls
        # etc.) that have no `.role`; the old `msg.role != "system"` filter
        # raised AttributeError on them. Keep only real non-system messages.
        conversation_history = [
            item
            for item in chat_ctx.items
            if isinstance(item, ChatMessage) and item.role != "system"
        ][-3:]
        chat_ctx_to_use.items.extend(conversation_history)

        # Use the configured logger (DEBUG level set at module import) instead
        # of bare prints, so production output is consistent and filterable.
        logging.debug("======== Final chat_ctx_to_use ========")
        for msg in chat_ctx_to_use.items:
            logging.debug("%s: %s", msg.role.upper(), msg.content)

        # Delegate to the stock LLM node with the trimmed, RAG-enriched context.
        async for chunk in Agent.default.llm_node(self, chat_ctx_to_use, tools, model_settings):
            yield chunk
async def request_fnc(req: JobRequest):
    """Accept every incoming job, joining the room with a fixed agent profile."""
    await req.accept(
        name="agent",                     # Participant.name (defaults to "")
        identity="identity",              # Participant.identity (defaults to "agent-<jobid>")
        attributes={"myagent": "rocks"},  # attributes set on the agent participant at join
    )
async def entrypoint(ctx: agents.JobContext):
    """Job entrypoint: load the RAG index, connect to the room, start the
    agent session, and send an initial greeting.

    Each failure path logs and returns, so a partially-initialized agent
    never lingers in the room.
    """
    try:
        from rag_utils import load_faiss_vectorstore

        try:
            vectorstore = load_faiss_vectorstore()
            print("[Agent] Vectorstore loaded successfully.", flush=True)
        except Exception:
            # BUG FIX: previously execution fell through here with
            # `vectorstore` undefined; the later NameError was swallowed by
            # the outer except, so the agent intermittently never joined the
            # room. Bail out explicitly instead.
            print("[ERROR] Failed to load vectorstore", flush=True)
            traceback.print_exc()
            return

        await ctx.connect()

        try:
            session = AgentSession(
                stt=deepgram.STT(model="nova-3", language="multi"),
                llm=openai.LLM(model="gpt-4o"),
                tts=elevenlabs.TTS(
                    voice_id="VJzrUxHaC52mTyYHMCnK",
                    model="eleven_turbo_v2_5",
                ),
                vad=silero.VAD.load(),
            )
        except Exception as e:
            logging.error(f"Failed to initialize session: {str(e)}")
            return

        try:
            agent = Assistant(session=session, index=vectorstore)
        except Exception as e:
            logging.error(f"Failed to initialize Assistant: {str(e)}")
            return

        try:
            await session.start(
                room=ctx.room,
                agent=agent,
                room_input_options=RoomInputOptions(),
            )
        except Exception as e:
            logging.error(f"Failed to start session: {str(e)}")
            return

        # Greet the user once the session is live.
        await session.generate_reply(
            instructions="Hi, welcome to Unbroker"
        )
    except Exception as e:
        logging.error(f"An unexpected error occurred: {str(e)}")
if __name__ == "__main__":
    # Start the LiveKit worker: `entrypoint` runs per job, `request_fnc`
    # decides whether/how incoming jobs are accepted.
    worker_options = agents.WorkerOptions(
        entrypoint_fnc=entrypoint,
        request_fnc=request_fnc,
    )
    agents.cli.run_app(worker_options)
token generation route
@app.route('/getToken', methods=["POST"])
def get_token():
    """Issue a LiveKit access token for a given identity and room.

    Expects a JSON body: {"identity": "...", "room": "..."}.
    Returns 400 when the body is missing, not JSON, or lacks either field.
    """
    # BUG FIX: get_json() returns None for a missing/non-JSON body, which
    # previously raised AttributeError (500) instead of the intended 400.
    data = request.get_json(silent=True) or {}
    identity = data.get("identity")
    room = data.get("room")
    if not identity or not room:
        return jsonify({"error": "Missing identity or room"}), 400

    token = api.AccessToken(
        os.getenv("LIVEKIT_API_KEY"),
        os.getenv("LIVEKIT_API_SECRET"),
    ).with_identity(identity).with_grants(
        api.VideoGrants(
            room_join=True,
            room=room,
            can_publish=True,
            can_subscribe=True,
        )
    )
    # SECURITY FIX: do not print the JWT — it is a bearer credential and was
    # being written to server logs. Also serialize it only once.
    return jsonify({"token": token.to_jwt()})
minimalist frontend code:
import React, { useState, useRef } from 'react';
import { Room, createLocalAudioTrack } from 'livekit-client';
const BACKEND_URL = import.meta.env.VITE_API_BASE;
const LIVEKIT_WS_URL = 'wss://ds-nl2qsdc2.livekit.cloud';
export default function VoiceAgent() {
const [roomName, setRoomName] = useState(`room-${Math.random().toString(36).slice(2, 8)}`);
const [userId, setUserId] = useState(`frontend-${Math.random().toString(36).slice(2, 8)}`);
const roomRef = useRef(null);
const handleJoin = async () => {
try {
// 🔌 Disconnect existing room if needed
if (roomRef.current) {
console.log('🔌 Disconnecting previous room');
roomRef.current.disconnect();
roomRef.current = null;
}
// 🔓 iOS fix: request mic access and resume AudioContext
await navigator.mediaDevices.getUserMedia({ audio: true });
if (typeof AudioContext !== 'undefined') {
const audioCtx = new AudioContext();
if (audioCtx.state === 'suspended') await audioCtx.resume();
}
// 🔑 Step 1: Get token
const tokenRes = await fetch(`${BACKEND_URL}/getToken`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ identity: userId, room: roomName }),
});
const { token } = await tokenRes.json();
if (!token) throw new Error('❌ No token received from backend');
console.log('🔑 Received token');
// 🧠 Step 2: Create room
const room = new Room();
roomRef.current = room;
// 📡 Step 3: Set event listeners
room.on('connected', () => console.log('✅ Connected to LiveKit room'));
room.on('disconnected', () => console.log('🔌 Disconnected from room'));
room.on('connectionError', err => console.error('❗ Connection error:', err));
room.on('participantConnected', p => console.log('👤 Participant joined:', p.identity));
room.on('participantDisconnected', p => console.log('👋 Participant left:', p.identity));
// 🔗 Step 4: Connect to server
await room.connect(LIVEKIT_WS_URL, token);
// 🎤 Step 5: Create and publish mic track
const micTrack = await createLocalAudioTrack();
console.log('🎙️ Mic track created:', micTrack);
// Optional mic test (hear yourself)
const micTestEl = micTrack.attach();
micTestEl.volume = 0.3;
micTestEl.autoplay = true;
document.body.appendChild(micTestEl);
await room.localParticipant.publishTrack(micTrack);
console.log('📢 Mic published');
// 🔉 Step 6: Subscribe to remote audio
room.on('trackSubscribed', (track, publication, participant) => {
if (track.kind === 'audio') {
console.log(`🔈 Subscribed to audio from ${participant.identity}`);
const audioEl = track.attach();
audioEl.autoplay = true;
document.body.appendChild(audioEl);
}
});
} catch (err) {
console.error('❌ Error joining room:', err);
alert('Join failed. See console.');
}
};
return (
<div style={{ padding: 20 }}>
<h2>🎙️ LiveKit Voice Join</h2>
<label>User ID: <input value={userId} onChange={e => setUserId(e.target.value)} /></label><br />
<label>Room: <input value={roomName} onChange={e => setRoomName(e.target.value)} /></label><br />
<button className="text-white" onClick={handleJoin}>Join Room</button>
</div>
);
}
Additional information
In the frontend logs, when the agent connects, I get "Participant joined: identity" and "Subscribed to audio from identity" along with other 4 logs. But when it does not connect, I get only the 4 other logs and no Participant joined.. or subscribed to audio.. log.
The above describes when the agent connects successfully. When it does not, the last two logs don't show up.