Skip to content

Commit 9a92ea4

Browse files
authored
Support Realtime custom voice objects (#3473)
1 parent fedc809 commit 9a92ea4

6 files changed

Lines changed: 215 additions & 8 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ requires-python = ">=3.10"
77
license = "MIT"
88
authors = [{ name = "OpenAI", email = "support@openai.com" }]
99
dependencies = [
10-
"openai>=2.26.0,<3",
10+
"openai>=2.36.0,<3",
1111
"pydantic>=2.12.2, <3",
1212
"griffelib>=2, <3",
1313
"typing-extensions>=4.12.2, <5",

src/agents/realtime/config.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,17 @@
4646
"""The audio format for realtime audio streams."""
4747

4848

49+
class RealtimeCustomVoice(TypedDict):
    """A custom Realtime voice object.

    The only declared key is ``id``, which is required.
    """

    id: str
    """The custom voice ID."""
54+
55+
56+
# Accepted voice values: a plain string, a ``RealtimeCustomVoice`` dict carrying an
# ``id``, or any other string-keyed mapping.
RealtimeVoice: TypeAlias = str | RealtimeCustomVoice | Mapping[str, Any]
57+
"""The voice to use for realtime audio output."""
58+
59+
4960
RealtimeReasoningEffort: TypeAlias = Literal["minimal", "low", "medium", "high", "xhigh"] | str
5061
"""The reasoning effort for realtime model responses."""
5162

@@ -124,7 +135,7 @@ class RealtimeAudioOutputConfig(TypedDict, total=False):
124135
"""Configuration for audio output in realtime sessions."""
125136

126137
format: RealtimeAudioFormat | OpenAIRealtimeAudioFormats
127-
voice: str
138+
voice: RealtimeVoice
128139
speed: float
129140

130141

@@ -163,7 +174,7 @@ class RealtimeSessionModelSettings(TypedDict):
163174
audio: NotRequired[RealtimeAudioConfig]
164175
"""The audio configuration for the session."""
165176

166-
voice: NotRequired[str]
177+
voice: NotRequired[RealtimeVoice]
167178
"""The voice to use for audio output."""
168179

169180
speed: NotRequired[float]

src/agents/realtime/openai_realtime.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,39 @@ def get_server_event_type_adapter() -> TypeAdapter[AllRealtimeServerEvents]:
370370
return ServerEventTypeAdapter
371371

372372

373+
_SERVER_EVENT_TYPES_WITH_CUSTOM_VOICE = frozenset(
374+
{
375+
"session.created",
376+
"session.updated",
377+
"response.created",
378+
"response.done",
379+
}
380+
)
381+
382+
383+
def _should_normalize_custom_voice_for_server_event(event: Any) -> bool:
384+
return isinstance(event, dict) and event.get("type") in _SERVER_EVENT_TYPES_WITH_CUSTOM_VOICE
385+
386+
387+
def _normalize_custom_voice_for_server_event_validation(value: Any) -> Any:
388+
# TODO: Remove this once generated Realtime server event models accept custom voice objects.
389+
if isinstance(value, list):
390+
return [_normalize_custom_voice_for_server_event_validation(item) for item in value]
391+
392+
if not isinstance(value, dict):
393+
return value
394+
395+
normalized: dict[str, Any] = {}
396+
for key, item in value.items():
397+
if key == "voice" and isinstance(item, Mapping):
398+
voice_id = item.get("id")
399+
if isinstance(voice_id, str):
400+
normalized[key] = voice_id
401+
continue
402+
normalized[key] = _normalize_custom_voice_for_server_event_validation(item)
403+
return normalized
404+
405+
373406
async def _collect_enabled_handoffs(
374407
agent: RealtimeAgent[Any], context_wrapper: RunContextWrapper[Any]
375408
) -> list[Handoff[Any, RealtimeAgent[Any]]]:
@@ -1054,7 +1087,14 @@ async def _handle_ws_event(self, event: dict[str, Any]):
10541087
try:
10551088
if "previous_item_id" in event and event["previous_item_id"] is None:
10561089
event["previous_item_id"] = "" # TODO (rm) remove
1057-
parsed: AllRealtimeServerEvents = self._server_event_type_adapter.validate_python(event)
1090+
validation_event = (
1091+
_normalize_custom_voice_for_server_event_validation(event)
1092+
if _should_normalize_custom_voice_for_server_event(event)
1093+
else event
1094+
)
1095+
parsed: AllRealtimeServerEvents = self._server_event_type_adapter.validate_python(
1096+
validation_event
1097+
)
10581098
except pydantic.ValidationError as e:
10591099
logger.error(f"Failed to validate server event: {event}", exc_info=True)
10601100
await self._emit_event(RealtimeModelErrorEvent(error=e))

tests/realtime/test_openai_realtime.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import pytest
99
import websockets
10+
from pydantic import TypeAdapter
1011

1112
from agents import Agent, function_tool
1213
from agents.exceptions import UserError
@@ -445,6 +446,80 @@ async def test_handle_invalid_event_schema_logs_error(self, model):
445446
error_event = mock_listener.on_event.call_args_list[1][0][0]
446447
assert error_event.type == "error"
447448

449+
@pytest.mark.asyncio
async def test_custom_voice_response_events_update_response_sequencer(self, model, monkeypatch):
    """Dict-shaped custom voices should not block response.create sequencing."""
    payload_types: list[str] = []

    async def fake_send_raw(event):
        # Record only the type of each outgoing payload instead of sending it.
        payload_types.append(event.type)

    class CustomVoiceRejectingAdapter:
        """Stand-in adapter that fails validation when the response voice is a dict."""

        _string_adapter = TypeAdapter(str)

        def validate_python(self, event):
            voice = event.get("response", {}).get("audio", {}).get("output", {}).get("voice")
            if isinstance(voice, dict):
                # Validating a dict as ``str`` raises a pydantic validation error.
                self._string_adapter.validate_python(voice)
            return SimpleNamespace(type=event["type"])

    monkeypatch.setattr(model, "_send_raw_message", fake_send_raw)
    model._server_event_type_adapter = CustomVoiceRejectingAdapter()
    mock_listener = AsyncMock()
    model.add_listener(mock_listener)

    await model._send_user_input(RealtimeModelSendUserInput(user_input="hi"))
    await asyncio.sleep(0)

    assert payload_types == ["conversation.item.create", "response.create"]
    assert model._response_control == "create_requested"

    response_with_custom_voice = {
        "type": "response.created",
        "response": {"audio": {"output": {"voice": {"id": "voice_test"}}}},
    }
    await model._handle_ws_event(response_with_custom_voice)

    assert model._ongoing_response is True
    assert model._response_control == "free"

    await model._handle_ws_event(
        {
            "type": "response.done",
            "response": {"audio": {"output": {"voice": {"id": "voice_test"}}}},
        }
    )

    assert model._ongoing_response is False
    assert model._response_control == "free"
    # The first listener event must carry the original payload object, with its
    # dict-shaped voice left untouched.
    raw_event = mock_listener.on_event.call_args_list[0][0][0]
    assert raw_event.data is response_with_custom_voice
    assert response_with_custom_voice["response"]["audio"]["output"]["voice"] == {
        "id": "voice_test"
    }

    await model._send_tool_output(
        RealtimeModelSendToolOutput(
            tool_call=SimpleNamespace(
                id="item_1",
                previous_item_id=None,
                call_id="call_1",
                arguments="{}",
                name="lookup",
            ),
            output="ok",
            start_response=True,
        )
    )
    await asyncio.sleep(0)

    # A second item/response pair must have been sent after the first response finished.
    assert payload_types == [
        "conversation.item.create",
        "response.create",
        "conversation.item.create",
        "response.create",
    ]
522+
448523
@pytest.mark.asyncio
449524
async def test_handle_unknown_event_type_ignored(self, model):
450525
"""Test that unknown event types are ignored gracefully."""
@@ -501,6 +576,35 @@ async def test_handle_audio_delta_event_success(self, model):
501576
assert audio_state is not None
502577
assert audio_state.audio_length_ms > 0 # Should have some audio length
503578

579+
@pytest.mark.asyncio
async def test_audio_delta_event_skips_custom_voice_normalization(self, model, monkeypatch):
    """High-frequency audio delta events should not pay for custom voice normalization."""
    mock_listener = AsyncMock()
    model.add_listener(mock_listener)
    model._audio_state_tracker.set_audio_format("pcm16")

    def fail_normalize(event):
        raise AssertionError("custom voice normalization should not run")

    # Swap in a normalizer that raises, so any call to it is meant to fail the test.
    monkeypatch.setattr(
        "agents.realtime.openai_realtime._normalize_custom_voice_for_server_event_validation",
        fail_normalize,
    )

    await model._handle_ws_event(
        {
            "type": "response.output_audio.delta",
            "event_id": "event_123",
            "response_id": "resp_123",
            "item_id": "item_456",
            "output_index": 0,
            "content_index": 0,
            "delta": "dGVzdCBhdWRpbw==",
        }
    )

    assert mock_listener.on_event.call_count == 2
607+
504608
@pytest.mark.asyncio
505609
async def test_backward_compat_output_item_added_and_done(self, model):
506610
"""response.output_item.added/done paths emit item updates."""
@@ -1519,6 +1623,22 @@ def test_get_and_update_session_config(self, model):
15191623
assert cfg.audio is not None and cfg.audio.output is not None
15201624
assert cfg.audio.output.voice == "verse"
15211625

1626+
def test_session_config_accepts_custom_voice_object(self, model):
    """A top-level custom voice dict should appear as the audio output voice."""
    custom_voice = {"id": "voice_test"}

    cfg = model._get_session_config({"voice": custom_voice})
    payload = cfg.model_dump(exclude_unset=True)

    assert payload["audio"]["output"]["voice"] == custom_voice
1633+
1634+
def test_session_config_accepts_nested_custom_voice_object(self, model):
    """A custom voice dict given under ``audio.output`` should be kept in the payload."""
    custom_voice = {"id": "voice_test"}

    cfg = model._get_session_config({"audio": {"output": {"voice": custom_voice}}})
    payload = cfg.model_dump(exclude_unset=True)

    assert payload["audio"]["output"]["voice"] == custom_voice
1641+
15221642
def test_session_config_defaults_audio_formats_when_not_call(self, model):
15231643
settings: dict[str, Any] = {}
15241644
cfg = model._get_session_config(settings)

tests/realtime/test_session.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1386,6 +1386,42 @@ async def test_handoff_tool_handling(self, mock_model):
13861386
# Verify agent was updated
13871387
assert session._current_agent == second_agent
13881388

1389+
@pytest.mark.asyncio
async def test_handoff_session_update_preserves_custom_voice(self, mock_model):
    """Handoff session updates should keep the configured custom voice object."""
    custom_voice = {"id": "voice_test"}
    first_agent = RealtimeAgent(
        name="first_agent",
        instructions="first_agent_instructions",
        tools=[],
        handoffs=[],
    )
    second_agent = RealtimeAgent(
        name="second_agent",
        instructions="second_agent_instructions",
        tools=[],
        handoffs=[],
    )
    first_agent.handoffs = [second_agent]
    session = RealtimeSession(
        mock_model,
        first_agent,
        None,
        model_config={"initial_model_settings": {"voice": custom_voice}},
    )

    # Invoke the handoff tool for the second agent.
    await session._handle_tool_call(
        RealtimeModelToolCallEvent(
            name=Handoff.default_tool_name(second_agent),
            call_id="call_789",
            arguments="{}",
        )
    )

    # The first sent event is the session update; it must still carry the voice dict.
    session_update_event = mock_model.sent_events[0]
    assert isinstance(session_update_event, RealtimeModelSendSessionUpdate)
    assert session_update_event.session_settings["voice"] == custom_voice
    assert mock_model.sent_events[1].start_response is True
1424+
13891425
@pytest.mark.asyncio
13901426
async def test_unknown_tool_handling(self, mock_model, mock_agent, mock_function_tool):
13911427
"""Test that unknown tools complete the model call without starting a response."""

uv.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)