Open
Description
We should allow the realtime API to respond in text only, since text-only output is natively supported by the API.
https://platform.openai.com/docs/api-reference/realtime
Currently (if I'm interpreting the behavior correctly), LiveKit treats text responses as errors, even when the modality is set to text only.
Today I used patch-package to patch @livekit/agents@0.7.0
for the project I'm working on.
Here is the diff that solved my problem:
diff --git a/node_modules/@livekit/agents/dist/multimodal/multimodal_agent.d.ts b/node_modules/@livekit/agents/dist/multimodal/multimodal_agent.d.ts
index 3902cda..a6a3c1a 100644
--- a/node_modules/@livekit/agents/dist/multimodal/multimodal_agent.d.ts
+++ b/node_modules/@livekit/agents/dist/multimodal/multimodal_agent.d.ts
@@ -34,11 +34,12 @@ export declare class MultimodalAgent extends EventEmitter {
linkedParticipant: RemoteParticipant | null;
subscribedTrack: RemoteAudioTrack | null;
readMicroTask: Promise<void> | null;
- constructor({ model, chatCtx, fncCtx, maxTextResponseRetries, }: {
+ constructor({ model, chatCtx, fncCtx, maxTextResponseRetries, allowTextReplies, }: {
model: RealtimeModel;
chatCtx?: llm.ChatContext;
fncCtx?: llm.FunctionContext;
maxTextResponseRetries?: number;
+ allowTextReplies?: boolean;
});
get fncCtx(): llm.FunctionContext | undefined;
set fncCtx(ctx: llm.FunctionContext | undefined);
diff --git a/node_modules/@livekit/agents/dist/multimodal/multimodal_agent.js b/node_modules/@livekit/agents/dist/multimodal/multimodal_agent.js
index b9f9f54..c61d375 100644
--- a/node_modules/@livekit/agents/dist/multimodal/multimodal_agent.js
+++ b/node_modules/@livekit/agents/dist/multimodal/multimodal_agent.js
@@ -26,17 +26,20 @@ class MultimodalAgent extends EventEmitter {
readMicroTask = null;
#textResponseRetries = 0;
#maxTextResponseRetries;
+ #allowTextReplies = false;
constructor({
model,
chatCtx,
fncCtx,
- maxTextResponseRetries = 5
+ maxTextResponseRetries = 5,
+ allowTextReplies = false,
}) {
super();
this.model = model;
this.#chatCtx = chatCtx;
this.#fncCtx = fncCtx;
this.#maxTextResponseRetries = maxTextResponseRetries;
+ this.#allowTextReplies = allowTextReplies;
}
#participant = null;
#agentPublication = null;
@@ -186,7 +189,7 @@ class MultimodalAgent extends EventEmitter {
this.#playingHandle = handle;
});
this.#session.on("response_content_done", (message) => {
- if (message.contentType === "text") {
+ if (message.contentType === "text" && !this.#allowTextReplies) {
if (this.#textResponseRetries >= this.#maxTextResponseRetries) {
throw new Error(
`The OpenAI Realtime API returned a text response after ${this.#maxTextResponseRetries} retries. Please try to reduce the number of text system or assistant messages in the chat context.`
This issue body was partially generated by patch-package.
Metadata
Metadata
Assignees
Labels
No labels