Skip to content

Commit 3b7ddc7

Browse files
committed
wip
1 parent 061fec5 commit 3b7ddc7

File tree

4 files changed

+88
-38
lines changed

4 files changed

+88
-38
lines changed

Sources/Core/Models/Item.swift

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,18 @@ import MetaCodable
88

99
public struct Audio: Equatable, Hashable, Codable, Sendable {
1010
/// Audio bytes
11-
public var audio: AudioData
11+
public var audio: AudioData?
1212

1313
/// The transcript of the audio
1414
public var transcript: String?
1515

16-
public init(audio: AudioData, transcript: String? = nil) {
16+
public init(audio: AudioData? = nil, transcript: String? = nil) {
1717
self.audio = audio
1818
self.transcript = transcript
1919
}
2020

21-
public init(audio: Data = Data(), transcript: String? = nil) {
22-
self.init(audio: AudioData(data: audio), transcript: transcript)
21+
public init(audio: Data? = nil, transcript: String? = nil) {
22+
self.init(audio: audio.map { AudioData(data: $0) }, transcript: transcript)
2323
}
2424
}
2525

@@ -36,15 +36,15 @@ import MetaCodable
3636
public enum Content: Equatable, Hashable, Sendable {
3737
case text(String)
3838
case audio(Audio)
39-
case input_text(String)
40-
case input_audio(Audio)
39+
case inputText(String)
40+
case inputAudio(Audio)
4141

4242
public var text: String? {
4343
switch self {
4444
case let .text(text): text
45-
case let .input_text(text): text
45+
case let .inputText(text): text
4646
case let .audio(audio): audio.transcript
47-
case let .input_audio(audio): audio.transcript
47+
case let .inputAudio(audio): audio.transcript
4848
}
4949
}
5050
}
@@ -439,11 +439,11 @@ extension Item.Message.Content: Codable {
439439
self = try .text(container.decode(String.self, forKey: .text))
440440
case "input_text":
441441
let container = try decoder.container(keyedBy: Text.CodingKeys.self)
442-
self = try .input_text(container.decode(String.self, forKey: .text))
443-
case "audio":
442+
self = try .inputText(container.decode(String.self, forKey: .text))
443+
case "output_audio":
444444
self = try .audio(Item.Audio(from: decoder))
445445
case "input_audio":
446-
self = try .input_audio(Item.Audio(from: decoder))
446+
self = try .inputAudio(Item.Audio(from: decoder))
447447
default:
448448
throw DecodingError.dataCorruptedError(forKey: .type, in: container, debugDescription: "Unknown content type: \(type)")
449449
}
@@ -456,14 +456,14 @@ extension Item.Message.Content: Codable {
456456
case let .text(text):
457457
try container.encode(text, forKey: .text)
458458
try container.encode("text", forKey: .type)
459-
case let .input_text(text):
459+
case let .inputText(text):
460460
try container.encode(text, forKey: .text)
461461
try container.encode("input_text", forKey: .type)
462462
case let .audio(audio):
463-
try container.encode("audio", forKey: .type)
463+
try container.encode("output_audio", forKey: .type)
464464
try container.encode(audio.audio, forKey: .audio)
465465
try container.encode(audio.transcript, forKey: .transcript)
466-
case let .input_audio(audio):
466+
case let .inputAudio(audio):
467467
try container.encode(audio.audio, forKey: .audio)
468468
try container.encode("input_audio", forKey: .type)
469469
try container.encode(audio.transcript, forKey: .transcript)

Sources/Core/Models/ServerEvent.swift

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,14 @@ import MetaCodable
4646
@CodedAs("conversation.item.created")
4747
case conversationItemCreated(eventId: String, item: Item, previousItemId: String?)
4848

49+
/// Returned when a conversation item is added.
50+
///
51+
/// - Parameter eventId: The unique ID of the server event.
52+
/// - Parameter item: A single item within a Realtime conversation.
53+
/// - Parameter previousItemId: The ID of the item that precedes this one, if any.
54+
@CodedAs("conversation.item.added")
55+
case conversationItemAdded(eventId: String, item: Item, previousItemId: String?)
56+
4957
/// Returned when a conversation item is finalized.
5058
/// - Parameter eventId: The unique ID of the server event.
5159
/// - Parameter item: A single item within a Realtime conversation.
@@ -202,6 +210,20 @@ import MetaCodable
202210
@CodedAs("input_audio_buffer.timeout_triggered")
203211
case inputAudioBufferTimeoutTriggered(eventId: String, itemId: String, audioStartMs: Int, audioEndMs: Int)
204212

213+
/// Returned when the output audio buffer starts playing audio.
214+
///
215+
/// - Parameter eventId: The unique ID of the server event.
216+
/// - Parameter responseId: The ID of the Response to which the output audio belongs.
217+
@CodedAs("output_audio_buffer.started")
218+
case outputAudioBufferStarted(eventId: String, responseId: String)
219+
220+
/// Returned when the output audio buffer stops playing audio.
221+
///
222+
/// - Parameter eventId: The unique ID of the server event.
223+
/// - Parameter responseId: The ID of the Response to which the output audio belongs.
224+
@CodedAs("output_audio_buffer.stopped")
225+
case outputAudioBufferStopped(eventId: String, responseId: String)
226+
205227
/// Returned when a new Response is created.
206228
///
207229
/// The first event of response creation, where the response is in an initial state of `inProgress`.
@@ -318,7 +340,7 @@ import MetaCodable
318340
/// - Parameter outputIndex: The index of the output item in the Response.
319341
/// - Parameter contentIndex: The index of the content part in the item's content array.
320342
/// - Parameter delta: The transcript delta.
321-
@CodedAs("response.audio_transcript.delta")
343+
@CodedAs("response.output_audio_transcript.delta")
322344
case responseAudioTranscriptDelta(
323345
eventId: String,
324346
responseId: String,
@@ -336,7 +358,7 @@ import MetaCodable
336358
/// - Parameter outputIndex: The index of the output item in the Response.
337359
/// - Parameter contentIndex: The index of the content part in the item's content array.
338360
/// - Parameter transcript: The final transcript of the audio.
339-
@CodedAs("response.audio_transcript.done")
361+
@CodedAs("response.output_audio_transcript.done")
340362
case responseAudioTranscriptDone(
341363
eventId: String,
342364
responseId: String,
@@ -509,6 +531,7 @@ extension ServerEvent: Identifiable {
509531
case let .error(id, _): id
510532
case let .sessionCreated(id, _): id
511533
case let .sessionUpdated(id, _): id
534+
case let .conversationItemAdded(id, _, _): id
512535
case let .conversationItemCreated(id, _, _): id
513536
case let .conversationItemDone(id, _, _): id
514537
case let .conversationItemRetrieved(id, _): id
@@ -523,6 +546,8 @@ extension ServerEvent: Identifiable {
523546
case let .inputAudioBufferSpeechStarted(id, _, _): id
524547
case let .inputAudioBufferSpeechStopped(id, _, _): id
525548
case let .inputAudioBufferTimeoutTriggered(id, _, _, _): id
549+
case let .outputAudioBufferStarted(id, _): id
550+
case let .outputAudioBufferStopped(id, _): id
526551
case let .responseCreated(id, _): id
527552
case let .responseDone(id, _): id
528553
case let .responseOutputItemAdded(id, _, _, _): id

Sources/UI/Conversation.swift

Lines changed: 36 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,25 +5,35 @@ import Foundation
55

66
public enum ConversationError: Error {
77
case sessionNotFound
8+
case invalidEphemeralKey
89
case converterInitializationFailed
910
}
1011

1112
@MainActor @Observable
1213
public final class Conversation: @unchecked Sendable {
1314
public typealias SessionUpdateCallback = (inout Session) -> Void
1415

15-
public var debug: Bool
1616
private let client: WebRTCConnector
1717
private var task: Task<Void, Error>!
1818
private let sessionUpdateCallback: SessionUpdateCallback?
1919
private let errorStream: AsyncStream<ServerError>.Continuation
2020

21-
/// A stream of errors that occur during the conversation.
22-
public let errors: AsyncStream<ServerError>
21+
/// Whether to print debug information to the console.
22+
public var debug: Bool
23+
24+
/// Whether to mute the user's microphone.
25+
public var muted: Bool = false {
26+
didSet {
27+
client.audioTrack.isEnabled = !muted
28+
}
29+
}
2330

2431
/// The unique ID of the conversation.
2532
public private(set) var id: String?
2633

34+
/// A stream of errors that occur during the conversation.
35+
public let errors: AsyncStream<ServerError>
36+
2737
/// The current session for this conversation.
2838
public private(set) var session: Session?
2939

@@ -38,6 +48,9 @@ public final class Conversation: @unchecked Sendable {
3848
/// This only works when using the server's voice detection.
3949
public private(set) var isUserSpeaking: Bool = false
4050

51+
/// Whether the model is currently speaking.
52+
public private(set) var isModelSpeaking: Bool = false
53+
4154
/// A list of messages in the conversation.
4255
/// Note that this doesn't include function call events. To get a complete list, use `entries`.
4356
public var messages: [Item.Message] {
@@ -47,9 +60,9 @@ public final class Conversation: @unchecked Sendable {
4760
} }
4861
}
4962

50-
public required init(debug: Bool = false, configuring sessionUpdateCallback: SessionUpdateCallback? = nil) throws {
63+
public required init(debug: Bool = false, configuring sessionUpdateCallback: SessionUpdateCallback? = nil) {
5164
self.debug = debug
52-
client = try WebRTCConnector.create()
65+
client = try! WebRTCConnector.create()
5366
self.sessionUpdateCallback = sessionUpdateCallback
5467
(errors, errorStream) = AsyncStream.makeStream(of: ServerError.self)
5568

@@ -69,21 +82,23 @@ public final class Conversation: @unchecked Sendable {
6982
}
7083
}
7184

85+
deinit {
86+
client.disconnect()
87+
errorStream.finish()
88+
}
89+
7290
public func connect(using request: URLRequest) async throws {
7391
await AVAudioApplication.requestRecordPermission()
7492

7593
try await client.connect(using: request)
7694
}
7795

7896
public func connect(ephemeralKey: String, model: Model = .gptRealtime) async throws {
79-
try await connect(using: .webRTCConnectionRequest(ephemeralKey: ephemeralKey, model: model))
80-
}
81-
82-
deinit {
83-
errorStream.finish()
84-
85-
Task { @MainActor [weak self] in
86-
self?.task?.cancel()
97+
do {
98+
try await connect(using: .webRTCConnectionRequest(ephemeralKey: ephemeralKey, model: model))
99+
} catch let error as WebRTCConnector.WebRTCError {
100+
guard case .invalidEphemeralKey = error else { throw error }
101+
throw ConversationError.invalidEphemeralKey
87102
}
88103
}
89104

@@ -137,7 +152,7 @@ public final class Conversation: @unchecked Sendable {
137152
/// Send a text message and wait for a response.
138153
/// Optionally, you can provide a response configuration to customize the model's behavior.
139154
public func send(from role: Item.Message.Role, text: String, response: Response.Config? = nil) throws {
140-
try send(event: .createConversationItem(.message(Item.Message(id: String(randomLength: 32), role: role, content: [.input_text(text)]))))
155+
try send(event: .createConversationItem(.message(Item.Message(id: String(randomLength: 32), role: role, content: [.inputText(text)]))))
141156
try send(event: .createResponse(using: response))
142157
}
143158

@@ -167,9 +182,9 @@ private extension Conversation {
167182
entries.removeAll { $0.id == itemId }
168183
case let .conversationItemInputAudioTranscriptionCompleted(_, itemId, contentIndex, transcript, _, _):
169184
updateEvent(id: itemId) { message in
170-
guard case let .input_audio(audio) = message.content[contentIndex] else { return }
185+
guard case let .inputAudio(audio) = message.content[contentIndex] else { return }
171186

172-
message.content[contentIndex] = .input_audio(.init(audio: audio.audio, transcript: transcript))
187+
message.content[contentIndex] = .inputAudio(.init(audio: audio.audio, transcript: transcript))
173188
}
174189
case let .conversationItemInputAudioTranscriptionFailed(_, _, _, error):
175190
errorStream.yield(error)
@@ -211,7 +226,7 @@ private extension Conversation {
211226
case let .responseOutputAudioDelta(_, _, itemId, _, contentIndex, delta):
212227
updateEvent(id: itemId) { message in
213228
guard case let .audio(audio) = message.content[contentIndex] else { return }
214-
message.content[contentIndex] = .audio(.init(audio: audio.audio.data + delta.data, transcript: audio.transcript))
229+
message.content[contentIndex] = .audio(.init(audio: (audio.audio?.data ?? Data()) + delta.data, transcript: audio.transcript))
215230
}
216231
case let .responseFunctionCallArgumentsDelta(_, _, itemId, _, _, delta):
217232
updateEvent(id: itemId) { functionCall in
@@ -225,6 +240,10 @@ private extension Conversation {
225240
isUserSpeaking = true
226241
case .inputAudioBufferSpeechStopped:
227242
isUserSpeaking = false
243+
case .outputAudioBufferStarted:
244+
isModelSpeaking = true
245+
case .outputAudioBufferStopped:
246+
isModelSpeaking = false
228247
case let .responseOutputItemDone(_, _, _, item):
229248
updateEvent(id: item.id) { message in
230249
guard case let .message(newMessage) = item else { return }

Sources/WebRTC/WebRTCConnector.swift

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ import FoundationNetworking
77
#endif
88

99
@Observable public final class WebRTCConnector: NSObject, Connector, Sendable {
10-
enum WebRTCError: Error {
10+
public enum WebRTCError: Error {
11+
case invalidEphemeralKey
1112
case missingAudioPermission
1213
case failedToCreateDataChannel
1314
case failedToCreatePeerConnection
@@ -24,7 +25,7 @@ import FoundationNetworking
2425
!audioTrack.isEnabled
2526
}
2627

27-
private let audioTrack: LKRTCAudioTrack
28+
package let audioTrack: LKRTCAudioTrack
2829
private let dataChannel: LKRTCDataChannel
2930
private let connection: LKRTCPeerConnection
3031

@@ -82,7 +83,6 @@ import FoundationNetworking
8283
public func disconnect() {
8384
connection.close()
8485
stream.finish()
85-
Task { @MainActor in status = .disconnected }
8686
}
8787

8888
public func toggleMute() {
@@ -166,7 +166,9 @@ private extension WebRTCConnector {
166166
request.setValue("application/sdp", forHTTPHeaderField: "Content-Type")
167167

168168
let (data, response) = try await URLSession.shared.data(for: request)
169-
guard let httpResponse = response as? HTTPURLResponse, (200...299).contains(httpResponse.statusCode), let remoteSdp = String(data: data, encoding: .utf8) else {
169+
170+
guard let response = response as? HTTPURLResponse, response.statusCode == 201, let remoteSdp = String(data: data, encoding: .utf8) else {
171+
if (response as? HTTPURLResponse)?.statusCode == 401 { throw WebRTCError.invalidEphemeralKey }
170172
throw WebRTCError.badServerResponse(response)
171173
}
172174

@@ -191,7 +193,11 @@ extension WebRTCConnector: LKRTCPeerConnectionDelegate {
191193

192194
extension WebRTCConnector: LKRTCDataChannelDelegate {
193195
public func dataChannel(_: LKRTCDataChannel, didReceiveMessageWith buffer: LKRTCDataBuffer) {
194-
stream.yield(with: Result { try self.decoder.decode(ServerEvent.self, from: buffer.data) })
196+
do { try stream.yield(decoder.decode(ServerEvent.self, from: buffer.data)) }
197+
catch {
198+
print("Failed to decode server event: \(String(data: buffer.data, encoding: .utf8) ?? "<invalid utf8>")")
199+
stream.finish(throwing: error)
200+
}
195201
}
196202

197203
public func dataChannelDidChangeState(_ dataChannel: LKRTCDataChannel) {

0 commit comments

Comments (0)