Realtime

Realtime Audio Example

using System.Text.Json;
using Betalgo.Ranul.OpenAI.Managers;
using Betalgo.Ranul.OpenAI.ObjectModels.RealtimeModels;
using Betalgo.Ranul.OpenAI.ObjectModels.SharedModels;

namespace OpenAI.Playground.TestHelpers.RealtimeHelpers;

/// <summary>
/// A comprehensive example implementation of OpenAI's Realtime API for audio interactions.
/// This class demonstrates how to:
/// - Establish and maintain a WebSocket connection with OpenAI's Realtime server
/// - Handle bidirectional audio streaming
/// - Process transcriptions and responses
/// - Implement function calling capabilities
/// - Manage the full lifecycle of a realtime conversation
/// </summary>
public class RealtimeAudioExample : IDisposable
{
    // Core services for the realtime interaction
    private readonly IOpenAIRealtimeService _ai;        // Manages the WebSocket connection and event handling
    private readonly VoiceInput _voiceInput;            // Handles audio input capture and processing
    private readonly VoiceOutput _voiceOutput;          // Manages audio output playback

    /// <summary>
    /// Initializes a new instance of the RealtimeAudioExample.
    /// Sets up the necessary components for audio interaction with OpenAI's Realtime API.
    /// </summary>
    /// <param name="ai">The OpenAI Realtime service instance that will manage the WebSocket connection</param>
    public RealtimeAudioExample(IOpenAIRealtimeService ai)
    {
        _ai = ai;
        _voiceInput = new(_ai);    // Initialize audio input handling
        _voiceOutput = new();       // Initialize audio output handling
    }

    /// <summary>
    /// Implements IDisposable to properly clean up resources.
    /// This is crucial for releasing audio hardware and closing network connections.
    /// </summary>
    public void Dispose()
    {
        _voiceInput.Dispose();      // Release audio input resources
        _voiceOutput.Dispose();     // Release audio output resources
        _ai.Dispose();              // Close WebSocket connection and clean up
    }

    /// <summary>
    /// Main execution method that orchestrates the entire realtime interaction.
    /// This method:
    /// 1. Sets up all necessary event handlers
    /// 2. Establishes the WebSocket connection
    /// 3. Configures the initial session parameters
    /// 4. Handles user input for recording control
    /// </summary>
    public async Task Run()
    {
        // Initialize all event handlers before connecting
        SetupEventHandlers();

        // Establish WebSocket connection to OpenAI's Realtime server
        // This creates a new session and prepares for bi-directional communication
        await _ai.ConnectAsync();

        // Configure the session with initial settings using session.update event
        // This configuration defines how the AI will behave and what capabilities it has
        await _ai.ClientEvents.Session.Update(new()
        {
            Session = new()
            {
                // Define the AI's personality and behavior
                // This is similar to system messages in the regular Chat API
                Instructions = "You are a great, upbeat friend. You made jokes all the time and your voices is full of joy.",

                // Select the voice for audio responses
                // Realtime API voice options include 'alloy', 'ash', 'ballad', 'coral', 'echo', 'sage', 'shimmer', and 'verse'
                Voice = "verse",

                // Enable both text and audio capabilities
                // This allows the AI to respond with both text transcriptions and spoken audio
                Modalities = ["text", "audio"],

                // Define tools (functions) that the AI can call during conversation
                // This example implements a weather checking function
                Tools =
                [
                    new()
                    {
                        Type = "function",
                        Name = "get_current_weather",
                        Description = "Get the current weather",
                        // Define the function parameters using JSON Schema
                        Parameters = PropertyDefinition.DefineObject(new Dictionary<string, PropertyDefinition>
                        {
                            // Location parameter is required
                            { "location", PropertyDefinition.DefineString("The city and state, e.g. San Francisco, CA") },
                            // Unit parameter is optional but must be either celsius or fahrenheit
                            { "unit", PropertyDefinition.DefineEnum(["celsius", "fahrenheit"], string.Empty) }
                        }, ["location"], null, null, null)
                    }
                ]
            }
        });

        // Main interaction loop - Handle user commands for recording
        Console.WriteLine("Press 'R' to start recording, 'S' to stop, 'Q' to quit");
        while (true)
        {
            var key = Console.ReadKey(true).Key;
            switch (key)
            {
                case ConsoleKey.R:
                    // Start capturing audio input
                    _voiceInput.StartRecording();
                    Console.WriteLine("Recording started...");
                    break;

                case ConsoleKey.S:
                    // Stop recording and process the audio
                    await StopAndSendAudio();
                    break;

                case ConsoleKey.Q:
                    // Exit the application
                    return;
            }
        }
    }

    /// <summary>
    /// Handles the process of stopping audio recording and sending it to OpenAI.
    /// This method:
    /// 1. Stops the audio recording
    /// 2. Commits the recorded audio buffer to create a user message
    /// 3. Requests an AI response
    /// </summary>
    private async Task StopAndSendAudio()
    {
        // Stop capturing audio input
        _voiceInput.StopRecording();
        Console.WriteLine("Recording stopped.");

        // Commit the audio buffer to create a user message
        // This triggers the input_audio_buffer.commit event
        await _ai.ClientEvents.InputAudioBuffer.Commit();

        // Request an AI response for the committed audio
        // This triggers the response.create event
        await _ai.ClientEvents.Response.Create();
    }

    /// <summary>
    /// Utility method to send pre-recorded audio files to the API.
    /// This is useful for testing or processing existing audio files.
    /// </summary>
    /// <param name="filePath">Path to the audio file to be sent</param>
    private async Task SendPreRecordedAudio(string filePath)
    {
        Console.WriteLine($"Sending pre-recorded audio: {filePath}");
        // Send the audio file contents
        await _voiceInput.SendAudioFile(filePath);
        // Commit the audio buffer to create a user message
        await _ai.ClientEvents.InputAudioBuffer.Commit();
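        // Note: unlike StopAndSendAudio, this helper does not request a response;
        // call _ai.ClientEvents.Response.Create() afterwards if you want the model to reply to the audio.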
    }

    /// <summary>
    /// Sets up all event handlers for the realtime session.
    /// This method configures handlers for:
    /// - Audio input processing and transcription
    /// - Speech detection
    /// - AI response processing
    /// - Function calls
    /// - Error handling
    /// 
    /// Each event handler corresponds to specific server events as defined in the OpenAI Realtime API documentation.
    /// </summary>
    private void SetupEventHandlers()
    {
        // AUDIO INPUT HANDLING EVENTS

        // Handle successful audio transcriptions
        // This event is triggered when input audio is successfully converted to text
        _ai.ServerEvents.Conversation.Item.InputAudioTranscription.OnCompleted += (sender, args) => {
            Console.WriteLine($"Transcription completed: {args.Transcript}");
        };

        // Handle failed transcription attempts
        // This helps identify issues with audio quality or processing
        _ai.ServerEvents.Conversation.Item.InputAudioTranscription.OnFailed += (sender, args) => {
            Console.WriteLine($"Transcription failed: {args.Error}");
        };

        // AUDIO BUFFER STATE EVENTS

        // Triggered when audio buffer is successfully committed
        // This indicates the audio has been properly sent to the server
        _ai.ServerEvents.InputAudioBuffer.OnCommitted += (sender, args) => {
            Console.WriteLine("Audio buffer committed.");
        };

        // Triggered when audio buffer is cleared
        // This happens when starting fresh or discarding unused audio
        _ai.ServerEvents.InputAudioBuffer.OnCleared += (sender, args) => {
            Console.WriteLine("Audio buffer cleared.");
        };

        // SPEECH DETECTION EVENTS

        // Handle speech end detection
        // This helps in identifying when the user has finished speaking
        _ai.ServerEvents.InputAudioBuffer.OnSpeechStopped += (sender, args) => {
            Console.WriteLine("Speech stopped detected.");
        };

        // Handle speech start detection
        // This is useful for implementing real-time interaction
        _ai.ServerEvents.InputAudioBuffer.OnSpeechStarted += async (sender, args) =>
        {
            Console.WriteLine("Speech started detected.");
            // Clear any ongoing audio output when user starts speaking
            _voiceOutput.StopAndClear();

            // Cancel any in-progress AI responses
            // This ensures a more natural conversation flow
            await _ai.ClientEvents.Response.Cancel();
        };

        // AI RESPONSE HANDLING EVENTS

        // Handle incoming text transcripts from the AI
        // This shows what the AI is saying in text form
        _ai.ServerEvents.Response.AudioTranscript.OnDelta += (sender, args) =>
        {
            Console.ForegroundColor = ConsoleColor.DarkGreen;
            Console.Write($"{args.Delta}");
            Console.ResetColor();
        };

        // AUDIO OUTPUT HANDLING

        // Process incoming audio data from the AI
        // This handles the AI's voice response in chunks
        _ai.ServerEvents.Response.Audio.OnDelta += (sender, args) =>
        {
            try
            {
                if (!string.IsNullOrEmpty(args.Delta))
                {
                    // Convert base64 audio data to bytes and queue for playback
                    var audioData = Convert.FromBase64String(args.Delta);
                    _voiceOutput.EnqueueAudioData(audioData);
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Error processing audio delta: {ex.Message}");
            }
        };

        // Handle completion of audio response
        _ai.ServerEvents.Response.Audio.OnDone += (sender, args) =>
        {
            Console.WriteLine();
            Console.WriteLine("Audio response completed.");
        };

        // FUNCTION CALLING EVENTS

        // Handle incoming function call arguments
        // This shows the AI's attempts to use tools/functions
        _ai.ServerEvents.Response.FunctionCallArguments.OnDelta += (sender, args) =>
        {
            Console.ForegroundColor = ConsoleColor.Yellow;
            Console.WriteLine($"Function call arguments delta: {args.Delta}");
            Console.ResetColor();
        };

        // Process completed function calls
        _ai.ServerEvents.Response.FunctionCallArguments.OnDone += async (sender, args) =>
        {
            if (args.Arguments != null)
            {
                Console.WriteLine($"Function call completed: {args.Arguments}");
                // Handle weather function calls specifically
                if (args.Name == "get_current_weather")
                {
                    await HandleWeatherFunction(args.Arguments, args.CallId);
                }
            }
        };

        // ERROR HANDLING

        // Global error handler for any API errors
        _ai.ServerEvents.OnError += (sender, args) => {
            Console.WriteLine($"Error: {args.Error.Message}");
        };

        // Debug event handler for all server events
        //_ai.ServerEvents.OnAll += (sender, args) =>
        //{
        //    Console.WriteLine($"Debug: {args}");
        //};
    }

    /// <summary>
    /// Handles weather function calls from the AI.
    /// This method:
    /// 1. Parses the function arguments
    /// 2. Simulates a weather API call
    /// 3. Returns the results to the AI
    /// 4. Triggers a new response based on the weather data
    /// </summary>
    /// <param name="arguments">JSON string containing the function arguments</param>
    /// <param name="callId">Unique identifier for the function call</param>
    private async Task HandleWeatherFunction(string arguments, string callId)
    {
        try
        {
            // Parse the weather query arguments
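            // For this tool the arguments JSON typically looks like:
            //   {"location":"San Francisco, CA","unit":"celsius"}
            // (illustrative values only; the actual payload comes from the model)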
            var args = JsonSerializer.Deserialize<WeatherArgs>(arguments);

            // Simulate getting weather data
            // In a real application, this would call an actual weather API
            var weatherResult = new
            {
                temperature = args.unit == "celsius" ? 22 : 72,
                unit = args.unit,
                description = "Sunny with light clouds",
                location = args.location
            };

            // Send the weather data back to the conversation
            // This creates a function_call_output item in the conversation
            await _ai.ClientEvents.Conversation.Item.Create(new()
            {
                Item = new()
                {
                    Type = ItemType.FunctionCallOutput,
                    CallId = callId,
                    Output = JsonSerializer.Serialize(weatherResult)
                }
            });

            // Request a new AI response based on the weather data
            await _ai.ClientEvents.Response.Create();
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error handling weather function: {ex.Message}");
        }
    }

    /// <summary>
    /// Data model for weather function arguments.
    /// This class maps to the JSON schema defined in the function parameters.
    /// </summary>
    private class WeatherArgs
    {
        public string location { get; set; }    // Required: city and state
        public string unit { get; set; }        // Optional: celsius or fahrenheit
    }
}
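
To run the example end to end you need an IOpenAIRealtimeService instance. How you obtain one depends on your setup (direct construction or dependency injection) and library version, so treat the snippet below as a rough sketch; the OpenAIRealtimeService and OpenAIOptions construction shown here is an assumption, not the canonical entry point.

// Sketch only: the exact OpenAIRealtimeService/OpenAIOptions wiring is an assumption;
// adapt it to however you normally create Betalgo.Ranul.OpenAI services (e.g. via DI).
var ai = new OpenAIRealtimeService(new OpenAIOptions
{
    ApiKey = Environment.GetEnvironmentVariable("OPENAI_API_KEY")!
});
using var example = new RealtimeAudioExample(ai);   // example.Dispose() also disposes the service
await example.Run();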

Voice Input

using Betalgo.Ranul.OpenAI.Managers;
using NAudio.Wave;

namespace OpenAI.Playground.TestHelpers.RealtimeHelpers;

/// <summary>
/// Handles voice input capture and processing for real-time communication with OpenAI's API.
/// This class manages audio recording, buffering, and transmission of audio data.
/// </summary>
public class VoiceInput : IDisposable
{
    // Minimum amount of audio to buffer before sending (in milliseconds)
    private const int MinimumBufferMs = 100;

    // Buffer to store audio data before sending
    private readonly List<byte> _audioBuffer;

    // Reference to the OpenAI real-time service client
    private readonly IOpenAIRealtimeService _client;

    // NAudio's wave input device for capturing audio
    private readonly WaveInEvent _waveIn;

    // Flag to track recording state
    private bool _isRecording;

    /// <summary>
    /// Initializes a new instance of VoiceInput with specified OpenAI client.
    /// </summary>
    /// <param name="client">The OpenAI real-time service client</param>
    public VoiceInput(IOpenAIRealtimeService client)
    {
        _client = client;
        // Configure audio input with specific format:
        // - 24000 Hz sample rate
        // - 16 bits per sample
        // - 1 channel (mono)
        _waveIn = new()
        {
            WaveFormat = new(24000, 16, 1),
            BufferMilliseconds = 50  // How often to receive audio data
        };
        _audioBuffer = [];
        _waveIn.DataAvailable += OnDataAvailable!;
    }

    /// <summary>
    /// Releases resources used by the voice input system
    /// </summary>
    public void Dispose()
    {
        _waveIn.Dispose();
    }

    /// <summary>
    /// Starts recording audio from the default input device
    /// </summary>
    public void StartRecording()
    {
        if (_isRecording) return;
        _isRecording = true;
        _audioBuffer.Clear();
        _waveIn.StartRecording();
    }

    /// <summary>
    /// Stops recording audio and sends any remaining buffered data
    /// </summary>
    public void StopRecording()
    {
        if (!_isRecording) return;
        _isRecording = false;
        _waveIn.StopRecording();

        // Send any remaining buffered audio before stopping
        if (_audioBuffer.Count > 0)
        {
            _client.ClientEvents.InputAudioBuffer.Append(_audioBuffer.ToArray());
            _audioBuffer.Clear();
        }
    }

    /// <summary>
    /// Handles incoming audio data from the recording device
    /// </summary>
    private void OnDataAvailable(object sender, WaveInEventArgs e)
    {
        if (!_isRecording) return;

        // Add new audio data to the buffer
        _audioBuffer.AddRange(e.Buffer.Take(e.BytesRecorded));

        // Calculate current buffer duration in milliseconds
        var bufferDurationMs = _audioBuffer.Count * 1000.0 / _waveIn.WaveFormat.AverageBytesPerSecond;
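        // With the 24 kHz, 16-bit, mono format configured above, AverageBytesPerSecond is 48,000,
        // so the 100 ms minimum corresponds to roughly 4,800 buffered bytes per send.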

        // Only send when we have accumulated enough audio data
        if (bufferDurationMs >= MinimumBufferMs)
        {
            _client.ClientEvents.InputAudioBuffer.Append(_audioBuffer.ToArray());
            _audioBuffer.Clear();
        }
    }

    /// <summary>
    /// Sends an audio file to the OpenAI API by streaming it in chunks
    /// </summary>
    /// <param name="filePath">Path to the audio file to send</param>
    public async Task SendAudioFile(string filePath)
    {
        using var audioFileReader = new AudioFileReader(filePath);
        // Calculate buffer size based on minimum buffer duration
        var bufferSize = (int)(audioFileReader.WaveFormat.AverageBytesPerSecond * (MinimumBufferMs / 1000.0));
        var buffer = new byte[bufferSize];
        int bytesRead;

        // Read and send the file in chunks
        while ((bytesRead = await audioFileReader.ReadAsync(buffer, 0, buffer.Length)) > 0)
        {
            if (bytesRead < buffer.Length)
            {
                // Handle the last chunk if it's smaller than the buffer
                var lastBuffer = new byte[bytesRead];
                Array.Copy(buffer, lastBuffer, bytesRead);
                buffer = lastBuffer;
            }

            // Resample the audio to match required format and send
            var resampledBuffer = ResampleAudio(buffer, bytesRead, audioFileReader.WaveFormat, _waveIn.WaveFormat);
            await _client.ClientEvents.InputAudioBuffer.Append(resampledBuffer);
        }
    }

    /// <summary>
    /// Resamples audio data to match the target format required by the API
    /// </summary>
    /// <param name="buffer">Original audio data</param>
    /// <param name="bytesRead">Number of bytes in the buffer</param>
    /// <param name="sourceFormat">Original audio format</param>
    /// <param name="targetFormat">Desired output format</param>
    /// <returns>Resampled audio data</returns>
    private static byte[] ResampleAudio(byte[] buffer, int bytesRead, WaveFormat sourceFormat, WaveFormat targetFormat)
    {
        // Skip resampling if formats match
        if (sourceFormat.SampleRate == targetFormat.SampleRate &&
            sourceFormat.BitsPerSample == targetFormat.BitsPerSample &&
            sourceFormat.Channels == targetFormat.Channels)
        {
            var trimmedBuffer = new byte[bytesRead];
            Array.Copy(buffer, trimmedBuffer, bytesRead);
            return trimmedBuffer;
        }

        // Perform resampling using MediaFoundation
        using var sourceStream = new RawSourceWaveStream(buffer, 0, bytesRead, sourceFormat);
        using var resampler = new MediaFoundationResampler(sourceStream, targetFormat);
        resampler.ResamplerQuality = 60;  // Set high quality resampling

        // Calculate and allocate buffer for resampled audio
        var resampledBytes = (int)(bytesRead * ((double)targetFormat.AverageBytesPerSecond / sourceFormat.AverageBytesPerSecond));
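        // For example, going from 48 kHz, 16-bit mono (96,000 bytes/s) to 24 kHz, 16-bit mono (48,000 bytes/s)
        // roughly halves the byte count; this is only an estimate, so the buffer is trimmed to the actual size below.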
        var resampledBuffer = new byte[resampledBytes];
        var resampledBytesRead = resampler.Read(resampledBuffer, 0, resampledBytes);

        // Trim the buffer to actual size and return
        var trimmedBuffer2 = new byte[resampledBytesRead];
        Array.Copy(resampledBuffer, trimmedBuffer2, resampledBytesRead);
        return trimmedBuffer2;
    }
}

Voice Output

using Betalgo.Ranul.OpenAI.ObjectModels.RealtimeModels;
using NAudio.Wave;

namespace OpenAI.Playground.TestHelpers.RealtimeHelpers;

/// <summary>
/// Handles real-time audio playback for OpenAI's audio responses
/// Manages buffering and streaming of audio data
/// </summary>
public class VoiceOutput : IDisposable
{
    // Core components for audio handling
    private readonly BufferedWaveProvider _bufferedWaveProvider;  // Manages audio data buffering
    private readonly WaveOutEvent _waveOut;                      // Handles audio output device
    private bool _isPlaying;                                     // Tracks current playback status

    /// <summary>
    /// Initializes the voice output system with OpenAI's default audio settings
    /// </summary>
    public VoiceOutput()
    {
        // Initialize audio output device
        _waveOut = new();
        // Register for playback stopped events
        _waveOut.PlaybackStopped += OnPlaybackStopped!;

        // Configure audio buffer with OpenAI's default settings
        _bufferedWaveProvider = new(new(
            RealtimeConstants.Audio.DefaultSampleRate,       // Standard sample rate
            RealtimeConstants.Audio.DefaultBitsPerSample,    // Bit depth for audio
            RealtimeConstants.Audio.DefaultChannels          // Number of audio channels
        ))
        {
            BufferLength = 10 * 1024 * 1024,    // Set 10 MB buffer size for smooth playback
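            // Assuming the default realtime format of 24 kHz, 16-bit, mono (48,000 bytes/s), 10 MB holds roughly 3.5 minutes of audio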
            DiscardOnBufferOverflow = true       // Prevent buffer overflow by discarding excess data
        };

        // Connect the buffer to the audio output
        _waveOut.Init(_bufferedWaveProvider);
    }

    /// <summary>
    /// Cleanup resources when object is disposed
    /// </summary>
    public void Dispose()
    {
        // Stop playback and release audio device resources
        _waveOut.Stop();
        _waveOut.Dispose();
    }

    /// <summary>
    /// Add new audio data to the playback queue
    /// Automatically starts playback if not already playing
    /// </summary>
    /// <param name="data">Raw audio data bytes to be played</param>
    public void EnqueueAudioData(byte[]? data)
    {
        // Ignore empty or null data
        if (data == null || data.Length == 0)
            return;

        // Add new audio data to the buffer
        _bufferedWaveProvider.AddSamples(data, 0, data.Length);

        // Start playback if not already playing
        if (!_isPlaying)
        {
            _waveOut.Play();
            _isPlaying = true;
        }
    }

    /// <summary>
    /// Stops playback and clears any remaining buffered audio
    /// </summary>
    public void StopAndClear()
    {
        // Stop playback if currently playing
        if (_isPlaying)
        {
            _waveOut.Stop();
            _isPlaying = false;
        }

        // Clear any remaining audio from buffer
        _bufferedWaveProvider.ClearBuffer();
        Console.WriteLine("Playback stopped and buffer cleared.");
    }

    /// <summary>
    /// Event handler for when playback stops
    /// Restarts playback if there's more data in buffer
    /// </summary>
    private void OnPlaybackStopped(object sender, StoppedEventArgs e)
    {
        // If there's more audio in the buffer, continue playing
        if (_bufferedWaveProvider.BufferedBytes > 0)
        {
            _waveOut.Play();
        }
        // Otherwise, mark playback as stopped
        else
        {
            _isPlaying = false;
        }
    }
}