Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/voice-agent-noisy-tuning.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ This guide documents the runtime tuning path used by `src/lib/agents/realtime/vo
- Config source: `src/lib/agents/realtime/voice-agent/config.ts`
- Runtime guards: `src/lib/agents/realtime/voice-agent/runtime-guards.ts`
- Realtime model wiring:
- `modelTransport` (`realtime` | `responses_ws`)
- `resolvedRealtimeModel` (adaptive profile aware)
- `resolvedResponsesModel`
- `inputAudioTranscription`
- `inputAudioNoiseReduction`
- `turnDetection`
Expand All @@ -17,6 +20,12 @@ This guide documents the runtime tuning path used by `src/lib/agents/realtime/vo
## Primary Knobs

- `VOICE_AGENT_TRANSCRIPTION_ENABLED`
- `VOICE_AGENT_MODEL_TRANSPORT` (`realtime` | `responses_ws`)
- `VOICE_AGENT_RESPONSES_MODEL` (defaults to `gpt-audio-1.5`)
- `VOICE_AGENT_REALTIME_MODEL_STRATEGY` (`fixed` | `adaptive_profile`)
- `VOICE_AGENT_REALTIME_MODEL_PRIMARY`
- `VOICE_AGENT_REALTIME_MODEL_SECONDARY`
- `VOICE_AGENT_REALTIME_MODEL` (explicit override)
- `VOICE_AGENT_INPUT_TRANSCRIPTION_MODEL`
- `VOICE_AGENT_TRANSCRIPTION_LANGUAGE`
- `VOICE_AGENT_TURN_DETECTION`
Expand Down
6 changes: 6 additions & 0 deletions example.env.local
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,12 @@ AGENT_LLM_MODEL=gpt-4.1 # (Optional) LLM
AGENT_STT_MODEL=your-stt-model-name # (Optional) STT model for agent (e.g., whisper-large-v3-turbo)
AGENT_TTS_MODEL=your-tts-model-name # (Optional) TTS model for agent (e.g., tts-1)
AGENT_STT_LANGUAGE=en # (Optional) STT language (e.g., en)
VOICE_AGENT_MODEL_TRANSPORT=responses_ws # (Optional) realtime | responses_ws
VOICE_AGENT_RESPONSES_MODEL=gpt-audio-1.5 # (Optional) Responses WS model for tool-calling turns
VOICE_AGENT_REALTIME_MODEL_STRATEGY=adaptive_profile # (Optional) fixed | adaptive_profile
VOICE_AGENT_REALTIME_MODEL_PRIMARY=gpt-realtime-1.5 # (Optional) Primary realtime model for adaptive profile
VOICE_AGENT_REALTIME_MODEL_SECONDARY=gpt-realtime-mini # (Optional) Secondary realtime model for adaptive profile
VOICE_AGENT_REALTIME_MODEL= # (Optional) Explicit realtime model override (wins over strategy/primary/secondary)
VOICE_AGENT_INPUT_TRANSCRIPTION_MODEL=gpt-4o-mini-transcribe # (Optional) Voice agent STT model; falls back to AGENT_STT_MODEL
VOICE_AGENT_TRANSCRIPTION_ENABLED=true # (Optional) Enable realtime STT (defaults to true when model provided and multi-participant mode is off)
VOICE_AGENT_TRANSCRIPTION_LANGUAGE=en # (Optional) Language hint for realtime transcription (defaults to AGENT_STT_LANGUAGE)
Expand Down
37 changes: 36 additions & 1 deletion src/app/settings/models/_lib/guided-config.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
import type { GuidedField, GuidedSection, ResolvedFieldSource } from './types';

const MODEL_SUGGESTIONS = {
openai: ['gpt-realtime', 'gpt-5-mini', 'gpt-4o-mini-transcription', 'whisper-1'],
openai: [
'gpt-realtime-1.5',
'gpt-realtime-mini',
'gpt-audio-1.5',
'gpt-audio-mini',
'gpt-5-mini',
'gpt-4o-mini-transcription',
'whisper-1',
],
anthropic: ['claude-haiku-4-5', 'claude-sonnet-4-5'],
google: ['gemini-2.5-pro', 'gemini-2.5-flash', 'gemini-3-pro-image-preview'],
cerebras: ['llama3.3-70b', 'gpt-oss-120b', 'qwen3-32b', 'llama3.1-8b'],
Expand All @@ -28,6 +36,20 @@ export const GUIDED_SECTIONS: GuidedSection[] = [
help: 'Used by realtime voice AgentSession model constructor.',
suggestions: MODEL_SUGGESTIONS.openai,
},
{
path: 'models.voiceRealtimePrimary',
label: 'Voice Realtime Primary Model',
kind: 'string',
help: 'Primary voice realtime model (used for full profile in adaptive mode).',
suggestions: MODEL_SUGGESTIONS.openai,
},
{
path: 'models.voiceRealtimeSecondary',
label: 'Voice Realtime Secondary Model',
kind: 'string',
help: 'Secondary voice realtime model (used for lite profile in adaptive mode).',
suggestions: MODEL_SUGGESTIONS.openai,
},
{
path: 'models.voiceRouter',
label: 'Voice Router Model',
Expand Down Expand Up @@ -204,6 +226,16 @@ export const GUIDED_SECTIONS: GuidedSection[] = [
],
help: 'Realtime turn detection mode.',
},
{
path: 'knobs.voice.realtimeModelStrategy',
label: 'Realtime Model Strategy',
kind: 'enum',
options: [
{ label: 'Adaptive Profile', value: 'adaptive_profile' },
{ label: 'Fixed', value: 'fixed' },
],
help: 'Choose model by capability profile (`adaptive_profile`) or use fixed realtime model.',
},
{
path: 'knobs.voice.inputNoiseReduction',
label: 'Input Noise Reduction',
Expand Down Expand Up @@ -325,8 +357,11 @@ const RESTART_REQUIRED_PREFIXES = [
const NEXT_SESSION_PREFIXES = [
'knobs.conductor.roomConcurrency',
'models.voiceRealtime',
'models.voiceRealtimePrimary',
'models.voiceRealtimeSecondary',
'models.voiceRouter',
'models.voiceStt',
'knobs.voice.realtimeModelStrategy',
];

export const ALL_GUIDED_FIELDS = GUIDED_SECTIONS.flatMap((section) => section.fields);
Expand Down
23 changes: 21 additions & 2 deletions src/app/settings/models/reference/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,18 @@ const modelRows: Row[] = [
rangeOrOptions: 'provider/model id',
notes: 'Realtime voice conversation model.',
},
{
path: 'models.voiceRealtimePrimary',
type: 'string',
rangeOrOptions: 'provider/model id',
notes: 'Primary realtime model (adaptive full-profile target).',
},
{
path: 'models.voiceRealtimeSecondary',
type: 'string',
rangeOrOptions: 'provider/model id',
notes: 'Secondary realtime model (adaptive lite-profile target).',
},
{
path: 'models.voiceRouter',
type: 'string',
Expand Down Expand Up @@ -143,6 +155,12 @@ const knobRows: Row[] = [
rangeOrOptions: 'provider/model id',
notes: 'Knob alias for realtime model.',
},
{
path: 'knobs.voice.realtimeModelStrategy',
type: 'enum',
rangeOrOptions: 'fixed | adaptive_profile',
notes: 'Realtime model selection strategy.',
},
{
path: 'knobs.voice.routerModel',
type: 'string',
Expand Down Expand Up @@ -274,8 +292,9 @@ const knobRows: Row[] = [
const coverageRows = [
{
runtime: 'Realtime voice conversation',
coverage: 'covered',
keys: 'models.voiceRealtime, knobs.voice.*',
coverage: 'partial (transport/model env-only)',
keys:
'models.voiceRealtime, models.voiceRealtimePrimary, models.voiceRealtimeSecondary, knobs.voice.*, VOICE_AGENT_MODEL_TRANSPORT env, VOICE_AGENT_RESPONSES_MODEL env',
},
{
runtime: 'Voice transcription',
Expand Down
8 changes: 4 additions & 4 deletions src/components/tool-dispatcher/hooks/useToolRunner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -932,14 +932,14 @@ export function useToolRunner(options: UseToolRunnerOptions): ToolRunnerApi {
typeof params?.intentId === 'string' && params.intentId.trim().length > 0
? params.intentId.trim()
: requestId;
const idempotencyKey =
typeof params?.idempotency_key === 'string' && params.idempotency_key.trim().length > 0
? params.idempotency_key.trim()
: `quick-${routeType}-${stableQuickHash(`${targetRoom}|${requestId}|${messageText}`)}`;
const participantId =
typeof params?.participant_id === 'string' && params.participant_id.trim().length > 0
? params.participant_id.trim()
: undefined;
const idempotencyKey =
typeof params?.idempotency_key === 'string' && params.idempotency_key.trim().length > 0
? params.idempotency_key.trim()
: `quick-${routeType}-${stableQuickHash(`${targetRoom}|${participantId || 'unknown'}|${requestId}|${messageText}`)}`;
const participantViewport =
participantId && participantViewportRef.current.has(participantId)
? participantViewportRef.current.get(participantId)?.viewport
Expand Down
65 changes: 65 additions & 0 deletions src/lib/agents/control-plane/resolver.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Stub the profile store so the resolver sees only env-derived defaults;
// these tests exercise the env fallback chain in isolation.
jest.mock('./profiles', () => ({
  getModelControlProfilesForResolution: jest.fn(async () => []),
}));

import { clearModelControlResolverCache, resolveModelControl } from './resolver';

// Every env var mutated below. The snapshot/restore in afterEach is driven by
// this single list so added keys cannot drift out of the restore path.
const ENV_KEYS = [
  'VOICE_AGENT_REALTIME_MODEL',
  'VOICE_AGENT_REALTIME_MODEL_PRIMARY',
  'VOICE_AGENT_REALTIME_MODEL_SECONDARY',
] as const;

type EnvKey = (typeof ENV_KEYS)[number];

describe('model control resolver env defaults', () => {
  // Snapshot derived from ENV_KEYS (not hand-listed) so the keys we save and
  // the keys we restore are always the same set.
  const originalEnv = Object.fromEntries(
    ENV_KEYS.map((key) => [key, process.env[key]]),
  ) as Record<EnvKey, string | undefined>;

  afterEach(() => {
    // Drop memoized resolution state, then restore the ambient environment
    // exactly (deleting keys that were originally unset).
    clearModelControlResolverCache();
    for (const key of ENV_KEYS) {
      const value = originalEnv[key];
      if (typeof value === 'undefined') {
        delete process.env[key];
      } else {
        process.env[key] = value;
      }
    }
  });

  it('does not force an explicit voiceRealtime model when only primary/secondary defaults apply', async () => {
    delete process.env.VOICE_AGENT_REALTIME_MODEL;
    process.env.VOICE_AGENT_REALTIME_MODEL_PRIMARY = 'gpt-realtime-1.5';
    process.env.VOICE_AGENT_REALTIME_MODEL_SECONDARY = 'gpt-realtime-mini';

    const resolved = await resolveModelControl(
      {
        task: 'voice.realtime',
        room: 'room-a',
        includeUserScope: false,
      },
      { skipCache: true },
    );

    // Without an explicit override, voiceRealtime stays unset so adaptive
    // primary/secondary selection remains effective downstream.
    expect(resolved.effective.models?.voiceRealtime).toBeUndefined();
    expect(resolved.effective.models?.voiceRealtimePrimary).toBe('gpt-realtime-1.5');
    expect(resolved.effective.models?.voiceRealtimeSecondary).toBe('gpt-realtime-mini');
  });

  it('preserves explicit voiceRealtime override when configured', async () => {
    // Pin primary/secondary to different ids so this test actually proves the
    // explicit override wins rather than coinciding with a default.
    process.env.VOICE_AGENT_REALTIME_MODEL = 'gpt-realtime-1.5';
    process.env.VOICE_AGENT_REALTIME_MODEL_PRIMARY = 'gpt-realtime-mini';
    process.env.VOICE_AGENT_REALTIME_MODEL_SECONDARY = 'gpt-realtime-mini';

    const resolved = await resolveModelControl(
      {
        task: 'voice.realtime',
        room: 'room-a',
        includeUserScope: false,
      },
      { skipCache: true },
    );

    expect(resolved.effective.models?.voiceRealtime).toBe('gpt-realtime-1.5');
  });
});
11 changes: 11 additions & 0 deletions src/lib/agents/control-plane/resolver.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,11 @@ const APPLY_MODE_BY_PATH: Array<{ prefix: string; mode: ApplyMode }> = [
{ prefix: 'knobs.conductor.taskRetryJitterRatio', mode: 'restart_required' },
{ prefix: 'knobs.conductor.roomConcurrency', mode: 'next_session' },
{ prefix: 'models.voiceRealtime', mode: 'next_session' },
{ prefix: 'models.voiceRealtimePrimary', mode: 'next_session' },
{ prefix: 'models.voiceRealtimeSecondary', mode: 'next_session' },
{ prefix: 'models.voiceRouter', mode: 'next_session' },
{ prefix: 'models.voiceStt', mode: 'next_session' },
{ prefix: 'knobs.voice.realtimeModelStrategy', mode: 'next_session' },
];

const deepMerge = <T extends Record<string, unknown>>(target: T, patch: Record<string, unknown>): T => {
Expand Down Expand Up @@ -131,7 +134,10 @@ const envDefaults = (): ModelControlPatch => {
models: {
canvasSteward: process.env.CANVAS_STEWARD_MODEL,
voiceRouter: process.env.VOICE_AGENT_ROUTER_MODEL,
// Keep explicit override separate so adaptive primary/secondary selection remains effective by default.
voiceRealtime: process.env.VOICE_AGENT_REALTIME_MODEL,
voiceRealtimePrimary: process.env.VOICE_AGENT_REALTIME_MODEL_PRIMARY || 'gpt-realtime-1.5',
voiceRealtimeSecondary: process.env.VOICE_AGENT_REALTIME_MODEL_SECONDARY || 'gpt-realtime-mini',
voiceStt:
process.env.VOICE_AGENT_STT_MODEL ||
process.env.VOICE_AGENT_INPUT_TRANSCRIPTION_MODEL ||
Expand Down Expand Up @@ -166,6 +172,11 @@ const envDefaults = (): ModelControlPatch => {
: process.env.VOICE_AGENT_TRANSCRIPTION_ENABLED === 'false'
? false
: undefined,
realtimeModelStrategy:
process.env.VOICE_AGENT_REALTIME_MODEL_STRATEGY === 'fixed' ||
process.env.VOICE_AGENT_REALTIME_MODEL_STRATEGY === 'adaptive_profile'
? process.env.VOICE_AGENT_REALTIME_MODEL_STRATEGY
: 'adaptive_profile',
turnDetection:
process.env.VOICE_AGENT_TURN_DETECTION === 'none' ||
process.env.VOICE_AGENT_TURN_DETECTION === 'server_vad' ||
Expand Down
3 changes: 3 additions & 0 deletions src/lib/agents/control-plane/schemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ export const voiceKnobPatchSchema = z
transcriptionEnabled: z.boolean().optional(),
sttModel: z.string().trim().min(1).max(120).optional(),
realtimeModel: z.string().trim().min(1).max(120).optional(),
realtimeModelStrategy: z.enum(['fixed', 'adaptive_profile']).optional(),
routerModel: z.string().trim().min(1).max(120).optional(),
turnDetection: z.enum(['none', 'server_vad', 'semantic_vad']).optional(),
inputNoiseReduction: z.enum(['none', 'near_field', 'far_field']).optional(),
Expand Down Expand Up @@ -79,6 +80,8 @@ export const modelControlModelsSchema = z
canvasSteward: z.string().trim().min(1).max(120).optional(),
voiceRouter: z.string().trim().min(1).max(120).optional(),
voiceRealtime: z.string().trim().min(1).max(120).optional(),
voiceRealtimePrimary: z.string().trim().min(1).max(120).optional(),
voiceRealtimeSecondary: z.string().trim().min(1).max(120).optional(),
voiceStt: z.string().trim().min(1).max(120).optional(),
searchModel: z.string().trim().min(1).max(120).optional(),
fastDefault: z.string().trim().min(1).max(120).optional(),
Expand Down
3 changes: 3 additions & 0 deletions src/lib/agents/control-plane/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ export type VoiceKnobPatch = {
transcriptionEnabled?: boolean;
sttModel?: string;
realtimeModel?: string;
realtimeModelStrategy?: 'fixed' | 'adaptive_profile';
routerModel?: string;
turnDetection?: 'none' | 'server_vad' | 'semantic_vad';
inputNoiseReduction?: 'none' | 'near_field' | 'far_field';
Expand Down Expand Up @@ -62,6 +63,8 @@ export type ModelControlPatch = {
canvasSteward?: string;
voiceRouter?: string;
voiceRealtime?: string;
voiceRealtimePrimary?: string;
voiceRealtimeSecondary?: string;
voiceStt?: string;
searchModel?: string;
fastDefault?: string;
Expand Down
4 changes: 4 additions & 0 deletions src/lib/agents/realtime/multi-participant-transcription.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ export type LiveTranscriptionPayload = {
export type MultiParticipantTranscriptionOptions = {
room: Room;
maxParticipants: number;
realtimeModel?: string;
model: string;
language?: string;
inputAudioNoiseReduction?: RealtimeNoiseReductionOption | null;
Expand Down Expand Up @@ -61,6 +62,7 @@ const getSpeakerLabel = (participant: any): string => {
export class MultiParticipantTranscriptionManager {
private room: Room;
private maxParticipants: number;
private realtimeModel?: string;
private model: string;
private language?: string;
private inputAudioNoiseReduction?: RealtimeNoiseReductionOption | null;
Expand All @@ -76,6 +78,7 @@ export class MultiParticipantTranscriptionManager {
constructor(options: MultiParticipantTranscriptionOptions) {
this.room = options.room;
this.maxParticipants = Math.max(1, Math.floor(options.maxParticipants));
this.realtimeModel = options.realtimeModel;
this.model = options.model;
this.language = options.language;
this.inputAudioNoiseReduction = options.inputAudioNoiseReduction;
Expand Down Expand Up @@ -225,6 +228,7 @@ export class MultiParticipantTranscriptionManager {
});

const llm = new openaiRealtime.RealtimeModel({
...(this.realtimeModel ? { model: this.realtimeModel } : {}),
inputAudioTranscription: {
model: this.model,
...(this.language ? { language: this.language } : {}),
Expand Down
Loading
Loading