4 changes: 3 additions & 1 deletion webapp/.env.example
@@ -4,4 +4,6 @@ TWILIO_ACCOUNT_SID=""
TWILIO_AUTH_TOKEN=""

# Optional - if using remote backend
REMOTE_BACKEND=""
REMOTE_BACKEND=""

OPENAI_API_KEY=""
26 changes: 26 additions & 0 deletions webapp/app/api/session/route.ts
@@ -0,0 +1,26 @@
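// Creates a short-lived Realtime session server-side, so OPENAI_API_KEY never
// reaches the browser; the returned client_secret is the ephemeral key that
// clients connect with.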
export async function GET() {
try {
const r = await fetch("https://api.openai.com/v1/realtime/sessions", {
method: "POST",
headers: {
Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
model: "gpt-4o-realtime-preview-2025-06-03",
voice: "verse",
}),
});

if (!r.ok) {
const error = await r.text();
return Response.json({ error }, { status: 500 });
}

const data = await r.json();
return Response.json(data);
} catch (error) {
console.error("Error creating realtime session:", error);
return Response.json({ error: "Internal server error" }, { status: 500 });
}
}
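
For reference, a minimal consumer sketch (an illustration, not part of this diff): it assumes the route is mounted at `/api/session` and that the sessions response carries the ephemeral key at `client_secret.value`, as `VoiceMiniApp` below also assumes.

```ts
// Hypothetical helper for consuming the route above.
async function fetchEphemeralKey(): Promise<string> {
  const res = await fetch("/api/session");
  if (!res.ok) throw new Error(`session request failed: ${res.status}`);
  const data = await res.json();
  // The ephemeral key lives under client_secret.value in the sessions response.
  return data.client_secret.value;
}
```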
16 changes: 15 additions & 1 deletion webapp/components/call-interface.tsx
@@ -3,7 +3,8 @@
import React, { useState, useEffect } from "react";
import TopBar from "@/components/top-bar";
import { Dialog, DialogTrigger, DialogContent } from "@/components/ui/dialog";
import { Settings } from "lucide-react";
import { Settings, Mic } from "lucide-react";
import VoiceMiniApp from "@/components/voice-mini-app";
import SessionConfigurationPanel from "@/components/session-configuration-panel";
import { EnhancedTranscript } from "@/components/enhanced-transcript";

@@ -33,6 +34,7 @@ const CallInterface = () => {
const [chatWs, setChatWs] = useState<WebSocket | null>(null);
const [chatStatus, setChatStatus] = useState<'connected' | 'disconnected' | 'connecting'>('disconnected');
const [userText, setUserText] = useState("");
const [voiceAppOpen, setVoiceAppOpen] = useState(false);
const transcript = useTranscript();

const canSendChat = chatStatus === 'connected' && userText.trim().length > 0;
@@ -126,13 +128,25 @@ const CallInterface = () => {

return (
<div className="h-screen bg-white flex flex-col">
<Dialog open={voiceAppOpen} onOpenChange={setVoiceAppOpen}>
<DialogContent className="max-w-sm w-full">
<VoiceMiniApp />
</DialogContent>
</Dialog>
<Dialog open={setupDialogOpen} onOpenChange={setSetupDialogOpen}>
<TopBar>
<ServiceChecklist
checklistResult={checklistResult}
allConfigsReady={allConfigsReady}
setAllConfigsReady={setAllConfigsReady}
/>
<button
className="p-2 rounded-full hover:bg-gray-100 focus:outline-none focus:ring-2 focus:ring-primary"
aria-label="Open voice mini app"
onClick={() => setVoiceAppOpen(true)}
>
<Mic className="w-5 h-5" />
</button>
<DialogTrigger asChild>
<button
className="p-2 rounded-full hover:bg-gray-100 focus:outline-none focus:ring-2 focus:ring-primary"
88 changes: 88 additions & 0 deletions webapp/components/voice-mini-app.tsx
@@ -0,0 +1,88 @@
"use client";

import React, { useRef, useState } from "react";
import { Button } from "@/components/ui/button";
import { useOpenAIRealtime } from "@/lib/use-openai-realtime";
import { RealtimeAgent } from "@openai/agents/realtime";

const VoiceMiniApp = () => {
const audioRef = useRef<HTMLAudioElement>(null);
const { status, connect, disconnect, mute, pushToTalkStart, pushToTalkStop } =
useOpenAIRealtime();
const [playbackEnabled, setPlaybackEnabled] = useState(true);
const [pushToTalk, setPushToTalk] = useState(false);
const [talking, setTalking] = useState(false);

const handleConnect = async () => {
const agent = new RealtimeAgent({
name: "web",
instructions: "You are a helpful voice assistant.",
});
await connect({
getEphemeralKey: async () => {
const res = await fetch("/api/session");
const data = await res.json();
return data?.client_secret?.value;
},
initialAgents: [agent],
audioElement: audioRef.current ?? undefined,
});
};

return (
<div className="flex flex-col gap-4">
<div>
{status === "CONNECTED" ? (
<Button onClick={disconnect}>Disconnect</Button>
) : (
<Button onClick={handleConnect}>Connect</Button>
)}
</div>
<label className="flex items-center gap-2">
<input
type="checkbox"
checked={playbackEnabled}
onChange={(e) => {
const enabled = e.target.checked;
setPlaybackEnabled(enabled);
mute(!enabled);
}}
/>
Audio playback
</label>
<label className="flex items-center gap-2">
<input
type="checkbox"
checked={pushToTalk}
onChange={(e) => setPushToTalk(e.target.checked)}
/>
Push to talk
</label>
{pushToTalk && (
<Button
onMouseDown={() => {
setTalking(true);
pushToTalkStart();
}}
onMouseUp={() => {
setTalking(false);
pushToTalkStop();
}}
onTouchStart={() => {
setTalking(true);
pushToTalkStart();
}}
onTouchEnd={() => {
setTalking(false);
pushToTalkStop();
}}
>
{talking ? "Talking..." : "Talk"}
</Button>
)}
<audio ref={audioRef} autoPlay className="hidden" />
</div>
);
};

export default VoiceMiniApp;
4 changes: 4 additions & 0 deletions webapp/docs/voice-client/README.md
@@ -24,6 +24,10 @@ The voice client is now integrated into the main webapp and accessible at:
http://localhost:3000/voice
```

An additional "Voice Mini-App" button in the top bar opens a lightweight overlay
that connects directly to OpenAI's Realtime API over WebRTC. Set `OPENAI_API_KEY`
in your `.env` file to enable this client.

## Usage

1. **Start the webapp**: `npm run dev` from the webapp directory
32 changes: 32 additions & 0 deletions webapp/lib/codecUtils.ts
@@ -0,0 +1,32 @@
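// Map a WebRTC codec name (e.g. from a ?codec= query param) to the Realtime API audio format.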
export function audioFormatForCodec(codec: string): 'pcm16' | 'g711_ulaw' | 'g711_alaw' {
let audioFormat: 'pcm16' | 'g711_ulaw' | 'g711_alaw' = 'pcm16';
if (typeof window !== 'undefined') {
const c = codec.toLowerCase();
if (c === 'pcmu') audioFormat = 'g711_ulaw';
else if (c === 'pcma') audioFormat = 'g711_alaw';
}
return audioFormat;
}

// Apply preferred codec on a peer connection's audio transceivers. Safe to call multiple times.
export function applyCodecPreferences(
pc: RTCPeerConnection,
codec: string,
): void {
try {
const caps = (RTCRtpSender as any).getCapabilities?.('audio');
if (!caps) return;

const pref = caps.codecs.find(
(c: any) => c.mimeType.toLowerCase() === `audio/${codec.toLowerCase()}`,
);
if (!pref) return;

pc
.getTransceivers()
.filter((t) => t.sender && t.sender.track?.kind === 'audio')
.forEach((t) => t.setCodecPreferences([pref]));
} catch (err) {
console.error('[codecUtils] applyCodecPreferences error', err);
}
}
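
A hedged usage sketch (not part of this diff) showing how the two helpers compose around a peer connection; the `?codec=` query-parameter convention comes from `use-openai-realtime.ts` below.

```ts
import { audioFormatForCodec, applyCodecPreferences } from "./codecUtils";

// Sketch: pin the negotiated codec on an outgoing audio track and derive
// the matching wire format for the Realtime session config.
async function setupAudio(codec: string) {
  const pc = new RTCPeerConnection();
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  stream.getAudioTracks().forEach((track) => pc.addTrack(track, stream));
  applyCodecPreferences(pc, codec); // no-op if the browser lacks the codec
  const format = audioFormatForCodec(codec); // 'pcm16' | 'g711_ulaw' | 'g711_alaw'
  return { pc, format };
}
```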
90 changes: 90 additions & 0 deletions webapp/lib/use-openai-realtime.ts
@@ -0,0 +1,90 @@
import { useRef, useState, useCallback } from "react";
import {
RealtimeSession,
RealtimeAgent,
OpenAIRealtimeWebRTC,
} from "@openai/agents/realtime";
import { audioFormatForCodec, applyCodecPreferences } from "./codecUtils";

interface ConnectOptions {
getEphemeralKey: () => Promise<string>;
initialAgents: RealtimeAgent[];
audioElement?: HTMLAudioElement;
}

export function useOpenAIRealtime() {
const sessionRef = useRef<RealtimeSession | null>(null);
const [status, setStatus] = useState<
"DISCONNECTED" | "CONNECTING" | "CONNECTED"
>("DISCONNECTED");

const connect = useCallback(
async ({ getEphemeralKey, initialAgents, audioElement }: ConnectOptions) => {
if (sessionRef.current) return;
setStatus("CONNECTING");

const ek = await getEphemeralKey();
const codecParam = (
typeof window !== "undefined"
? new URLSearchParams(window.location.search).get("codec") ?? "opus"
: "opus"
).toLowerCase();
const audioFormat = audioFormatForCodec(codecParam);
const rootAgent = initialAgents[0];

sessionRef.current = new RealtimeSession(rootAgent, {
transport: new OpenAIRealtimeWebRTC({
audioElement,
changePeerConnection: async (pc: RTCPeerConnection) => {
applyCodecPreferences(pc, codecParam);
return pc;
},
}),
model: "gpt-4o-realtime-preview-2025-06-03",
config: {
inputAudioFormat: audioFormat,
outputAudioFormat: audioFormat,
inputAudioTranscription: {
model: "gpt-4o-mini-transcribe",
},
},
});

      try {
        await sessionRef.current.connect({ apiKey: ek });
        setStatus("CONNECTED");
      } catch (err) {
        // Reset so a failed connect doesn't leave the hook stuck with a dead session.
        sessionRef.current = null;
        setStatus("DISCONNECTED");
        throw err;
      }
},
[]
);

const disconnect = useCallback(() => {
sessionRef.current?.close();
sessionRef.current = null;
setStatus("DISCONNECTED");
}, []);

const mute = useCallback((m: boolean) => {
sessionRef.current?.mute(m);
}, []);

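  // Push-to-talk press: clear any buffered input audio so the turn starts fresh.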
const pushToTalkStart = useCallback(() => {
sessionRef.current?.transport.sendEvent({
type: "input_audio_buffer.clear",
} as any);
}, []);

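  // Push-to-talk release: commit the buffered audio as the user turn and request a reply.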
const pushToTalkStop = useCallback(() => {
sessionRef.current?.transport.sendEvent({
type: "input_audio_buffer.commit",
} as any);
sessionRef.current?.transport.sendEvent({ type: "response.create" } as any);
}, []);

return {
status,
connect,
disconnect,
mute,
pushToTalkStart,
pushToTalkStop,
} as const;
}
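
One caveat worth noting: the push-to-talk pair above assumes manual turn-taking. Under the Realtime API's default server VAD, the server commits audio on its own, so a session meant for push-to-talk would typically disable turn detection first. A sketch of one way to do that (the `session.update` event shape is the Realtime API's; sending it here is an assumption, not something this PR does):

```ts
// Sketch: switch the live session to manual turns before using push-to-talk.
sessionRef.current?.transport.sendEvent({
  type: "session.update",
  session: { turn_detection: null },
} as any);
```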