Support Realtime custom voice objects (#3473)

lionel-oai · web-flow · commit 9a92ea4c8e4c · 2026-05-26T17:39:40.000+09:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ requires-python = ">=3.10"
 license = "MIT"
 authors = [{ name = "OpenAI", email = "support@openai.com" }]
 dependencies = [
-    "openai>=2.26.0,<3",
+    "openai>=2.36.0,<3",
     "pydantic>=2.12.2, <3",
     "griffelib>=2, <3",
     "typing-extensions>=4.12.2, <5",
diff --git a/src/agents/realtime/config.py b/src/agents/realtime/config.py
@@ -46,6 +46,17 @@
 """The audio format for realtime audio streams."""
 
 
+class RealtimeCustomVoice(TypedDict):
+    """A custom Realtime voice object."""
+
+    id: str
+    """The custom voice ID."""
+
+
+RealtimeVoice: TypeAlias = str | RealtimeCustomVoice | Mapping[str, Any]
+"""The voice for realtime audio output: a voice name string or a custom voice object mapping."""
+
+
 RealtimeReasoningEffort: TypeAlias = Literal["minimal", "low", "medium", "high", "xhigh"] | str
 """The reasoning effort for realtime model responses."""
 
@@ -124,7 +135,7 @@ class RealtimeAudioOutputConfig(TypedDict, total=False):
     """Configuration for audio output in realtime sessions."""
 
     format: RealtimeAudioFormat | OpenAIRealtimeAudioFormats
-    voice: str
+    voice: RealtimeVoice
     speed: float
 
 
@@ -163,7 +174,7 @@ class RealtimeSessionModelSettings(TypedDict):
     audio: NotRequired[RealtimeAudioConfig]
     """The audio configuration for the session."""
 
-    voice: NotRequired[str]
+    voice: NotRequired[RealtimeVoice]
     """The voice to use for audio output."""
 
     speed: NotRequired[float]
diff --git a/src/agents/realtime/openai_realtime.py b/src/agents/realtime/openai_realtime.py
@@ -370,6 +370,39 @@ def get_server_event_type_adapter() -> TypeAdapter[AllRealtimeServerEvents]:
     return ServerEventTypeAdapter
 
 
+_SERVER_EVENT_TYPES_WITH_CUSTOM_VOICE = frozenset(
+    {
+        "session.created",
+        "session.updated",
+        "response.created",
+        "response.done",
+    }
+)
+
+
+def _should_normalize_custom_voice_for_server_event(event: Any) -> bool:
+    return isinstance(event, dict) and event.get("type") in _SERVER_EVENT_TYPES_WITH_CUSTOM_VOICE
+
+
+def _normalize_custom_voice_for_server_event_validation(value: Any) -> Any:
+    # TODO: Remove this once generated Realtime server event models accept custom voice objects.
+    if isinstance(value, list):
+        return [_normalize_custom_voice_for_server_event_validation(item) for item in value]
+
+    if not isinstance(value, dict):
+        return value
+
+    normalized: dict[str, Any] = {}
+    for key, item in value.items():
+        if key == "voice" and isinstance(item, Mapping):
+            voice_id = item.get("id")
+            if isinstance(voice_id, str):
+                normalized[key] = voice_id
+                continue
+        normalized[key] = _normalize_custom_voice_for_server_event_validation(item)
+    return normalized
+
+
 async def _collect_enabled_handoffs(
     agent: RealtimeAgent[Any], context_wrapper: RunContextWrapper[Any]
 ) -> list[Handoff[Any, RealtimeAgent[Any]]]:
@@ -1054,7 +1087,14 @@ async def _handle_ws_event(self, event: dict[str, Any]):
         try:
             if "previous_item_id" in event and event["previous_item_id"] is None:
                 event["previous_item_id"] = ""  # TODO (rm) remove
-            parsed: AllRealtimeServerEvents = self._server_event_type_adapter.validate_python(event)
+            validation_event = (
+                _normalize_custom_voice_for_server_event_validation(event)
+                if _should_normalize_custom_voice_for_server_event(event)
+                else event
+            )
+            parsed: AllRealtimeServerEvents = self._server_event_type_adapter.validate_python(
+                validation_event
+            )
         except pydantic.ValidationError as e:
             logger.error(f"Failed to validate server event: {event}", exc_info=True)
             await self._emit_event(RealtimeModelErrorEvent(error=e))
diff --git a/tests/realtime/test_openai_realtime.py b/tests/realtime/test_openai_realtime.py
@@ -7,6 +7,7 @@
 
 import pytest
 import websockets
+from pydantic import TypeAdapter
 
 from agents import Agent, function_tool
 from agents.exceptions import UserError
@@ -445,6 +446,80 @@ async def test_handle_invalid_event_schema_logs_error(self, model):
         error_event = mock_listener.on_event.call_args_list[1][0][0]
         assert error_event.type == "error"
 
+    @pytest.mark.asyncio
+    async def test_custom_voice_response_events_update_response_sequencer(self, model, monkeypatch):
+        """Dict-shaped custom voices should not block response.create sequencing."""
+        payload_types: list[str] = []
+
+        async def fake_send_raw(event):
+            payload_types.append(event.type)
+
+        class CustomVoiceRejectingAdapter:
+            _string_adapter = TypeAdapter(str)
+
+            def validate_python(self, event):
+                voice = event.get("response", {}).get("audio", {}).get("output", {}).get("voice")
+                if isinstance(voice, dict):
+                    self._string_adapter.validate_python(voice)
+                return SimpleNamespace(type=event["type"])
+
+        monkeypatch.setattr(model, "_send_raw_message", fake_send_raw)
+        model._server_event_type_adapter = CustomVoiceRejectingAdapter()
+        mock_listener = AsyncMock()
+        model.add_listener(mock_listener)
+
+        await model._send_user_input(RealtimeModelSendUserInput(user_input="hi"))
+        await asyncio.sleep(0)
+
+        assert payload_types == ["conversation.item.create", "response.create"]
+        assert model._response_control == "create_requested"
+
+        response_with_custom_voice = {
+            "type": "response.created",
+            "response": {"audio": {"output": {"voice": {"id": "voice_test"}}}},
+        }
+        await model._handle_ws_event(response_with_custom_voice)
+
+        assert model._ongoing_response is True
+        assert model._response_control == "free"
+
+        await model._handle_ws_event(
+            {
+                "type": "response.done",
+                "response": {"audio": {"output": {"voice": {"id": "voice_test"}}}},
+            }
+        )
+
+        assert model._ongoing_response is False
+        assert model._response_control == "free"
+        raw_event = mock_listener.on_event.call_args_list[0][0][0]
+        assert raw_event.data is response_with_custom_voice
+        assert response_with_custom_voice["response"]["audio"]["output"]["voice"] == {
+            "id": "voice_test"
+        }
+
+        await model._send_tool_output(
+            RealtimeModelSendToolOutput(
+                tool_call=SimpleNamespace(
+                    id="item_1",
+                    previous_item_id=None,
+                    call_id="call_1",
+                    arguments="{}",
+                    name="lookup",
+                ),
+                output="ok",
+                start_response=True,
+            )
+        )
+        await asyncio.sleep(0)
+
+        assert payload_types == [
+            "conversation.item.create",
+            "response.create",
+            "conversation.item.create",
+            "response.create",
+        ]
+
     @pytest.mark.asyncio
     async def test_handle_unknown_event_type_ignored(self, model):
         """Test that unknown event types are ignored gracefully."""
@@ -501,6 +576,35 @@ async def test_handle_audio_delta_event_success(self, model):
         assert audio_state is not None
         assert audio_state.audio_length_ms > 0  # Should have some audio length
 
+    @pytest.mark.asyncio
+    async def test_audio_delta_event_skips_custom_voice_normalization(self, model, monkeypatch):
+        """High-frequency audio delta events should not pay for custom voice normalization."""
+        mock_listener = AsyncMock()
+        model.add_listener(mock_listener)
+        model._audio_state_tracker.set_audio_format("pcm16")
+
+        def fail_normalize(event):
+            raise AssertionError("custom voice normalization should not run")
+
+        monkeypatch.setattr(
+            "agents.realtime.openai_realtime._normalize_custom_voice_for_server_event_validation",
+            fail_normalize,
+        )
+
+        await model._handle_ws_event(
+            {
+                "type": "response.output_audio.delta",
+                "event_id": "event_123",
+                "response_id": "resp_123",
+                "item_id": "item_456",
+                "output_index": 0,
+                "content_index": 0,
+                "delta": "dGVzdCBhdWRpbw==",
+            }
+        )
+
+        assert mock_listener.on_event.call_count == 2
+
     @pytest.mark.asyncio
     async def test_backward_compat_output_item_added_and_done(self, model):
         """response.output_item.added/done paths emit item updates."""
@@ -1519,6 +1623,22 @@ def test_get_and_update_session_config(self, model):
         assert cfg.audio is not None and cfg.audio.output is not None
         assert cfg.audio.output.voice == "verse"
 
+    def test_session_config_accepts_custom_voice_object(self, model):
+        custom_voice = {"id": "voice_test"}
+
+        cfg = model._get_session_config({"voice": custom_voice})
+        payload = cfg.model_dump(exclude_unset=True)
+
+        assert payload["audio"]["output"]["voice"] == custom_voice
+
+    def test_session_config_accepts_nested_custom_voice_object(self, model):
+        custom_voice = {"id": "voice_test"}
+
+        cfg = model._get_session_config({"audio": {"output": {"voice": custom_voice}}})
+        payload = cfg.model_dump(exclude_unset=True)
+
+        assert payload["audio"]["output"]["voice"] == custom_voice
+
     def test_session_config_defaults_audio_formats_when_not_call(self, model):
         settings: dict[str, Any] = {}
         cfg = model._get_session_config(settings)
diff --git a/tests/realtime/test_session.py b/tests/realtime/test_session.py
@@ -1386,6 +1386,42 @@ async def test_handoff_tool_handling(self, mock_model):
         # Verify agent was updated
         assert session._current_agent == second_agent
 
+    @pytest.mark.asyncio
+    async def test_handoff_session_update_preserves_custom_voice(self, mock_model):
+        custom_voice = {"id": "voice_test"}
+        first_agent = RealtimeAgent(
+            name="first_agent",
+            instructions="first_agent_instructions",
+            tools=[],
+            handoffs=[],
+        )
+        second_agent = RealtimeAgent(
+            name="second_agent",
+            instructions="second_agent_instructions",
+            tools=[],
+            handoffs=[],
+        )
+        first_agent.handoffs = [second_agent]
+        session = RealtimeSession(
+            mock_model,
+            first_agent,
+            None,
+            model_config={"initial_model_settings": {"voice": custom_voice}},
+        )
+
+        await session._handle_tool_call(
+            RealtimeModelToolCallEvent(
+                name=Handoff.default_tool_name(second_agent),
+                call_id="call_789",
+                arguments="{}",
+            )
+        )
+
+        session_update_event = mock_model.sent_events[0]
+        assert isinstance(session_update_event, RealtimeModelSendSessionUpdate)
+        assert session_update_event.session_settings["voice"] == custom_voice
+        assert mock_model.sent_events[1].start_response is True
+
     @pytest.mark.asyncio
     async def test_unknown_tool_handling(self, mock_model, mock_agent, mock_function_tool):
         """Test that unknown tools complete the model call without starting a response."""
diff --git a/uv.lock b/uv.lock