4 changes: 3 additions & 1 deletion webapp/.env.example
@@ -4,4 +4,6 @@ TWILIO_ACCOUNT_SID=""
TWILIO_AUTH_TOKEN=""

# Optional - if using remote backend
REMOTE_BACKEND=""
REMOTE_BACKEND=""

OPENAI_API_KEY=""
26 changes: 26 additions & 0 deletions webapp/app/api/session/route.ts
@@ -0,0 +1,26 @@
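// Creates a short-lived Realtime session server-side, so OPENAI_API_KEY never
// reaches the browser; the returned client_secret is the ephemeral key that
// clients connect with.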
export async function GET() {
try {
const r = await fetch("https://api.openai.com/v1/realtime/sessions", {
method: "POST",
headers: {
Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
model: "gpt-4o-realtime-preview-2025-06-03",
voice: "verse",
}),
});

if (!r.ok) {
const error = await r.text();
return Response.json({ error }, { status: 500 });
}

const data = await r.json();
return Response.json(data);
} catch (error) {
console.error("Error creating realtime session:", error);
return Response.json({ error: "Internal server error" }, { status: 500 });
}
}
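
For reference, a minimal consumer sketch (an illustration, not part of this diff): it assumes the route is mounted at `/api/session` and that the sessions response carries the ephemeral key at `client_secret.value`, as `VoiceMiniApp` below also assumes.

```ts
// Hypothetical helper for consuming the route above.
async function fetchEphemeralKey(): Promise<string> {
  const res = await fetch("/api/session");
  if (!res.ok) throw new Error(`session request failed: ${res.status}`);
  const data = await res.json();
  // The ephemeral key lives under client_secret.value in the sessions response.
  return data.client_secret.value;
}
```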
16 changes: 15 additions & 1 deletion webapp/components/call-interface.tsx
@@ -3,7 +3,8 @@
import React, { useState, useEffect } from "react";
import TopBar from "@/components/top-bar";
import { Dialog, DialogTrigger, DialogContent } from "@/components/ui/dialog";
import { Settings } from "lucide-react";
import { Settings, Mic } from "lucide-react";
import VoiceMiniApp from "@/components/voice-mini-app";
import SessionConfigurationPanel from "@/components/session-configuration-panel";
import { EnhancedTranscript } from "@/components/enhanced-transcript";

@@ -33,6 +34,7 @@ const CallInterface = () => {
const [chatWs, setChatWs] = useState<WebSocket | null>(null);
const [chatStatus, setChatStatus] = useState<'connected' | 'disconnected' | 'connecting'>('disconnected');
const [userText, setUserText] = useState("");
const [voiceAppOpen, setVoiceAppOpen] = useState(false);
const transcript = useTranscript();

const canSendChat = chatStatus === 'connected' && userText.trim().length > 0;
@@ -126,13 +128,25 @@ const CallInterface = () => {

return (
<div className="h-screen bg-white flex flex-col">
<Dialog open={voiceAppOpen} onOpenChange={setVoiceAppOpen}>
<DialogContent className="max-w-sm w-full">
<VoiceMiniApp />
</DialogContent>
</Dialog>
<Dialog open={setupDialogOpen} onOpenChange={setSetupDialogOpen}>
<TopBar>
<ServiceChecklist
checklistResult={checklistResult}
allConfigsReady={allConfigsReady}
setAllConfigsReady={setAllConfigsReady}
/>
<button
className="p-2 rounded-full hover:bg-gray-100 focus:outline-none focus:ring-2 focus:ring-primary"
aria-label="Open voice mini app"
onClick={() => setVoiceAppOpen(true)}
>
<Mic className="w-5 h-5" />
</button>
<DialogTrigger asChild>
<button
className="p-2 rounded-full hover:bg-gray-100 focus:outline-none focus:ring-2 focus:ring-primary"
88 changes: 88 additions & 0 deletions webapp/components/voice-mini-app.tsx
@@ -0,0 +1,88 @@
"use client";

import React, { useRef, useState } from "react";
import { Button } from "@/components/ui/button";
import { useOpenAIRealtime } from "@/lib/use-openai-realtime";
import { RealtimeAgent } from "@openai/agents/realtime";

const VoiceMiniApp = () => {
const audioRef = useRef<HTMLAudioElement>(null);
const { status, connect, disconnect, mute, pushToTalkStart, pushToTalkStop } =
useOpenAIRealtime();
const [playbackEnabled, setPlaybackEnabled] = useState(true);
const [pushToTalk, setPushToTalk] = useState(false);
const [talking, setTalking] = useState(false);

const handleConnect = async () => {
const agent = new RealtimeAgent({
name: "web",
instructions: "You are a helpful voice assistant.",
});
await connect({
getEphemeralKey: async () => {
const res = await fetch("/api/session");
const data = await res.json();
return data?.client_secret?.value;
},
initialAgents: [agent],
audioElement: audioRef.current ?? undefined,
});
};

return (
<div className="flex flex-col gap-4">
<div>
{status === "CONNECTED" ? (
<Button onClick={disconnect}>Disconnect</Button>
) : (
<Button onClick={handleConnect}>Connect</Button>
)}
</div>
<label className="flex items-center gap-2">
<input
type="checkbox"
checked={playbackEnabled}
onChange={(e) => {
const enabled = e.target.checked;
setPlaybackEnabled(enabled);
mute(!enabled);
}}
/>
Audio playback
</label>
<label className="flex items-center gap-2">
<input
type="checkbox"
checked={pushToTalk}
onChange={(e) => setPushToTalk(e.target.checked)}
/>
Push to talk
</label>
{pushToTalk && (
<Button
onMouseDown={() => {
setTalking(true);
pushToTalkStart();
}}
onMouseUp={() => {
setTalking(false);
pushToTalkStop();
}}
onTouchStart={() => {
setTalking(true);
pushToTalkStart();
}}
onTouchEnd={() => {
setTalking(false);
pushToTalkStop();
}}
>
{talking ? "Talking..." : "Talk"}
</Button>
)}
<audio ref={audioRef} autoPlay className="hidden" />
</div>
);
};

export default VoiceMiniApp;
4 changes: 4 additions & 0 deletions webapp/docs/voice-client/README.md
@@ -24,6 +24,10 @@ The voice client is now integrated into the main webapp and accessible at:
http://localhost:3000/voice
```

An additional "Voice Mini-App" button in the top bar opens a lightweight overlay
that connects directly to OpenAI's Realtime API over WebRTC. Set `OPENAI_API_KEY`
in your `.env` file to enable this client.

## Usage

1. **Start the webapp**: `npm run dev` from the webapp directory
32 changes: 32 additions & 0 deletions webapp/lib/codecUtils.ts
@@ -0,0 +1,32 @@
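// Map a WebRTC codec name (e.g. from a ?codec= query param) to the Realtime API audio format.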
export function audioFormatForCodec(codec: string): 'pcm16' | 'g711_ulaw' | 'g711_alaw' {
let audioFormat: 'pcm16' | 'g711_ulaw' | 'g711_alaw' = 'pcm16';
if (typeof window !== 'undefined') {
const c = codec.toLowerCase();
if (c === 'pcmu') audioFormat = 'g711_ulaw';
else if (c === 'pcma') audioFormat = 'g711_alaw';
}
return audioFormat;
}

// Apply preferred codec on a peer connection's audio transceivers. Safe to call multiple times.
export function applyCodecPreferences(
pc: RTCPeerConnection,
codec: string,
): void {
try {
const caps = (RTCRtpSender as any).getCapabilities?.('audio');
if (!caps) return;

const pref = caps.codecs.find(
(c: any) => c.mimeType.toLowerCase() === `audio/${codec.toLowerCase()}`,
);
if (!pref) return;

pc
.getTransceivers()
.filter((t) => t.sender && t.sender.track?.kind === 'audio')
.forEach((t) => t.setCodecPreferences([pref]));
} catch (err) {
console.error('[codecUtils] applyCodecPreferences error', err);
}
}
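
A hedged usage sketch (not part of this diff) showing how the two helpers compose around a peer connection; the `?codec=` query-parameter convention comes from `use-openai-realtime.ts` below.

```ts
import { audioFormatForCodec, applyCodecPreferences } from "./codecUtils";

// Sketch: pin the negotiated codec on an outgoing audio track and derive
// the matching wire format for the Realtime session config.
async function setupAudio(codec: string) {
  const pc = new RTCPeerConnection();
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  stream.getAudioTracks().forEach((track) => pc.addTrack(track, stream));
  applyCodecPreferences(pc, codec); // no-op if the browser lacks the codec
  const format = audioFormatForCodec(codec); // 'pcm16' | 'g711_ulaw' | 'g711_alaw'
  return { pc, format };
}
```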
90 changes: 90 additions & 0 deletions webapp/lib/use-openai-realtime.ts
@@ -0,0 +1,90 @@
import { useRef, useState, useCallback } from "react";
import {
RealtimeSession,
RealtimeAgent,
OpenAIRealtimeWebRTC,
} from "@openai/agents/realtime";
import { audioFormatForCodec, applyCodecPreferences } from "./codecUtils";

interface ConnectOptions {
getEphemeralKey: () => Promise<string>;
initialAgents: RealtimeAgent[];
audioElement?: HTMLAudioElement;
}

export function useOpenAIRealtime() {
const sessionRef = useRef<RealtimeSession | null>(null);
const [status, setStatus] = useState<
"DISCONNECTED" | "CONNECTING" | "CONNECTED"
>("DISCONNECTED");

const connect = useCallback(
async ({ getEphemeralKey, initialAgents, audioElement }: ConnectOptions) => {
if (sessionRef.current) return;
setStatus("CONNECTING");

const ek = await getEphemeralKey();
const codecParam = (
typeof window !== "undefined"
? new URLSearchParams(window.location.search).get("codec") ?? "opus"
: "opus"
).toLowerCase();
const audioFormat = audioFormatForCodec(codecParam);
const rootAgent = initialAgents[0];

sessionRef.current = new RealtimeSession(rootAgent, {
transport: new OpenAIRealtimeWebRTC({
audioElement,
changePeerConnection: async (pc: RTCPeerConnection) => {
applyCodecPreferences(pc, codecParam);
return pc;
},
}),
model: "gpt-4o-realtime-preview-2025-06-03",
config: {
inputAudioFormat: audioFormat,
outputAudioFormat: audioFormat,
inputAudioTranscription: {
model: "gpt-4o-mini-transcribe",
},
},
});

      try {
        await sessionRef.current.connect({ apiKey: ek });
        setStatus("CONNECTED");
      } catch (err) {
        // Reset so a failed connect doesn't leave the hook stuck with a dead session.
        sessionRef.current = null;
        setStatus("DISCONNECTED");
        throw err;
      }
},
[]
);

const disconnect = useCallback(() => {
sessionRef.current?.close();
sessionRef.current = null;
setStatus("DISCONNECTED");
}, []);

const mute = useCallback((m: boolean) => {
sessionRef.current?.mute(m);
}, []);

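  // Push-to-talk press: clear any buffered input audio so the turn starts fresh.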
const pushToTalkStart = useCallback(() => {
sessionRef.current?.transport.sendEvent({
type: "input_audio_buffer.clear",
} as any);
}, []);

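  // Push-to-talk release: commit the buffered audio as the user turn and request a reply.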
const pushToTalkStop = useCallback(() => {
sessionRef.current?.transport.sendEvent({
type: "input_audio_buffer.commit",
} as any);
sessionRef.current?.transport.sendEvent({ type: "response.create" } as any);
}, []);

return {
status,
connect,
disconnect,
mute,
pushToTalkStart,
pushToTalkStop,
} as const;
}
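
One caveat worth noting: the push-to-talk pair above assumes manual turn-taking. Under the Realtime API's default server VAD, the server commits audio on its own, so a session meant for push-to-talk would typically disable turn detection first. A sketch of one way to do that (the `session.update` event shape is the Realtime API's; sending it here is an assumption, not something this PR does):

```ts
// Sketch: switch the live session to manual turns before using push-to-talk.
sessionRef.current?.transport.sendEvent({
  type: "session.update",
  session: { turn_detection: null },
} as any);
```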