Skip to content
adham90 edited this page Feb 20, 2026 · 4 revisions

Audio Support

RubyLLM::Agents provides two base classes for audio operations:

  • Transcriber - Audio-to-text (speech recognition)
  • Speaker - Text-to-audio (text-to-speech / TTS)

Table of Contents


Transcription (Audio → Text)

Convert audio files to text using speech recognition models.

Transcriber Quick Start

# Generate a transcriber
rails generate ruby_llm_agents:transcriber meeting

# app/agents/audio/meeting_transcriber.rb
module Audio
  class MeetingTranscriber < ApplicationTranscriber
    model "whisper-1"
    language "en"
  end
end

# Usage
result = Audio::MeetingTranscriber.call(audio: "meeting.mp3")
result.text           # "Hello, welcome to the meeting..."
result.audio_duration # 120.5 (seconds)
result.total_cost     # 0.012

Transcriber DSL

class MyTranscriber < ApplicationTranscriber
  # Model selection
  model "whisper-1"              # Default transcription model
  # Alternatives: "gpt-4o-transcribe", "gpt-4o-mini-transcribe"

  # Language settings
  language "en"                   # ISO 639-1 code (nil = auto-detect)

  # Output format
  output_format :text             # :text, :json, :srt, :vtt

  # Timestamp granularity
  include_timestamps :segment     # :none, :segment, :word

  # Caching
  cache_for 30.days               # Enable caching

  # Optional: Provide context for better accuracy
  def prompt
    "Technical discussion about Ruby programming"
  end

  # Optional: Post-process transcription
  def postprocess_text(text)
    text
      .gsub(/\bum\b/i, '')       # Remove filler words
      .gsub(/\buh\b/i, '')
      .squeeze(' ')
  end
end

Input Sources

# From file path
result = Audio::MeetingTranscriber.call(audio: "meeting.mp3")

# From URL
result = Audio::MeetingTranscriber.call(audio: "https://example.com/audio.mp3")

# From File object
result = Audio::MeetingTranscriber.call(audio: File.open("meeting.mp3"))

# From binary data with format hint
result = Audio::MeetingTranscriber.call(audio: audio_blob, format: :mp3)

Reliability Configuration

class ReliableTranscriber < ApplicationTranscriber
  model "gpt-4o-transcribe"

  reliability do
    retry_on_failure max_attempts: 3
  end

  fallback_models "whisper-1", "gpt-4o-mini-transcribe"
end

TranscriptionResult

result = Audio::MeetingTranscriber.call(audio: "meeting.mp3")

# Text content
result.text              # Full transcription text
result.segments          # Array of segments with timestamps
result.words             # Array of words with timestamps (if requested)

# Audio metadata
result.audio_duration    # Duration in seconds
result.audio_format      # Detected format
result.language          # Requested language
result.detected_language # Auto-detected language

# Execution metadata
result.model_id          # Model used
result.duration_ms       # Processing time
result.total_cost        # Cost in USD
result.started_at        # Execution start time
result.completed_at      # Execution end time
result.tenant_id         # Multi-tenant identifier

# Status
result.success?          # true if no error
result.error?            # true if failed

# Subtitle generation
result.srt               # SRT subtitle format
result.vtt               # VTT subtitle format

# Analysis helpers
result.words_per_minute        # Speaking rate
result.segment_at(30.5)        # Find segment at timestamp
result.text_between(10, 60)    # Get text in time range

Subtitle Generation

module Audio
  class SubtitleTranscriber < ApplicationTranscriber
    model "whisper-1"
    include_timestamps :segment
  end
end

result = Audio::SubtitleTranscriber.call(audio: "video.mp4")

# Save as SRT (for video players)
File.write("captions.srt", result.srt)

# Save as VTT (for web video)
File.write("captions.vtt", result.vtt)

SRT Output:

1
00:00:00,000 --> 00:00:02,500
Hello everyone.

2
00:00:02,500 --> 00:00:05,000
Welcome to the meeting.

VTT Output:

WEBVTT

00:00:00.000 --> 00:00:02.500
Hello everyone.

00:00:02.500 --> 00:00:05.000
Welcome to the meeting.

Text-to-Speech (Text → Audio)

Generate natural speech audio from text.

Speaker Quick Start

# Generate a speaker
rails generate ruby_llm_agents:speaker narrator

# app/agents/audio/narrator_speaker.rb
module Audio
  class NarratorSpeaker < ApplicationSpeaker
    provider :openai
    model "tts-1"
    voice "nova"
  end
end

# Usage
result = Audio::NarratorSpeaker.call(text: "Hello, world!")
result.audio          # Binary audio data
result.save_to("output.mp3")

Speaker DSL

class MyNarrator < ApplicationSpeaker
  # Provider selection
  provider :openai              # :openai, :elevenlabs

  # Model and voice
  model "tts-1-hd"              # "tts-1" for faster/cheaper
  voice "nova"                  # Voice name
  # OpenAI voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer

  # Voice ID (ElevenLabs)
  voice_id "21m00Tcm..."        # Voice ID for cloned voices

  # Audio settings
  speed 1.0                     # Speech speed (0.25-4.0 for OpenAI)
  output_format :mp3            # :mp3, :wav, :opus, :pcm, :alaw, :ulaw
  # Or use ElevenLabs native format strings directly:
  # output_format "mp3_44100_192"  # High-bitrate MP3
  # output_format "opus_48000_64"  # Low-latency Opus
  # output_format "pcm_16000"      # Telephony PCM

  # Streaming
  streaming true                # Enable streaming mode

  # Caching
  cache_for 7.days              # Enable caching

  # Custom pronunciation lexicon
  lexicon do
    pronounce "API", "A P I"
    pronounce "SQL", "sequel"
    pronounce "RubyLLM", "ruby L L M"
    pronounce "nginx", "engine-X"
  end
end

SpeechResult

result = Audio::ArticleNarrator.call(text: "Hello!")

# Audio data
result.audio          # Binary audio data
result.save_to(path)  # Save to file
result.to_base64      # Base64 encoded
result.to_data_uri    # Data URI for web embedding

# Metadata
result.duration       # Audio duration (seconds)
result.format         # Output format (:mp3, :wav, etc.)
result.file_size      # Size in bytes
result.characters     # Input character count

# Provider info
result.provider       # :openai, :elevenlabs
result.model_id       # Model used
result.voice_id       # Voice identifier
result.voice_name     # Voice name

# Execution
result.duration_ms    # Processing time
result.total_cost     # Cost in USD
result.started_at     # Execution start time
result.completed_at   # Execution end time
result.tenant_id      # Multi-tenant identifier
result.success?       # true if no error

Streaming Audio

module Audio
  class StreamingNarrator < ApplicationSpeaker
    provider :elevenlabs
    voice "Rachel"
    streaming true
  end
end

# Stream to audio player
Audio::StreamingNarrator.stream(text: "Long article...") do |chunk|
  audio_player.play(chunk.audio)
end

# Force streaming on any speaker
Audio::ArticleNarrator.stream(text: "Hello!") do |chunk|
  buffer << chunk.audio
end

ElevenLabs Configuration

class PremiumNarrator < ApplicationSpeaker
  provider :elevenlabs
  model "eleven_multilingual_v2"
  voice_id "21m00Tcm4TlvDq8ikWAM"

  # Voice settings specific to ElevenLabs
  voice_settings do
    stability 0.5            # 0-1: Lower = more expressive
    similarity_boost 0.75    # 0-1: Higher = closer to original voice
    style 0.5                # 0-1: Style exaggeration
    speaker_boost true       # Enhance speaker clarity
  end
end

Reliability Configuration

class ReliableSpeaker < ApplicationSpeaker
  provider :elevenlabs
  voice "Rachel"

  reliability do
    retry_on_failure max_attempts: 3
  end

  fallback_models "tts-1-hd", "tts-1"
end

Generators

Generate a Transcriber

# Basic transcriber
rails generate ruby_llm_agents:transcriber meeting

# With options
rails generate ruby_llm_agents:transcriber meeting --model gpt-4o-transcribe
rails generate ruby_llm_agents:transcriber meeting --language es
rails generate ruby_llm_agents:transcriber meeting --output-format srt
rails generate ruby_llm_agents:transcriber meeting --cache 30.days

This creates:

  • app/agents/audio/application_transcriber.rb (if not exists)
  • app/agents/audio/meeting_transcriber.rb

Generate a Speaker

# Basic speaker
rails generate ruby_llm_agents:speaker narrator

# With options
rails generate ruby_llm_agents:speaker narrator --provider elevenlabs
rails generate ruby_llm_agents:speaker narrator --voice alloy
rails generate ruby_llm_agents:speaker narrator --speed 1.25
rails generate ruby_llm_agents:speaker narrator --format wav
rails generate ruby_llm_agents:speaker narrator --cache 7.days

This creates:

  • app/agents/audio/application_speaker.rb (if not exists)
  • app/agents/audio/narrator_speaker.rb

Configuration

Global Defaults

# config/initializers/ruby_llm_agents.rb
RubyLLM::Agents.configure do |config|
  # Transcription defaults
  config.default_transcription_model = "whisper-1"
  config.track_transcriptions = true

  # TTS defaults
  config.default_tts_provider = :openai
  config.default_tts_model = "tts-1"
  config.default_tts_voice = "nova"
  config.track_speech = true

  # ElevenLabs (required for :elevenlabs provider)
  config.elevenlabs_api_key = ENV["ELEVENLABS_API_KEY"]
  config.elevenlabs_api_base = "https://api.elevenlabs.io"  # optional, default

  # ElevenLabs dynamic pricing (fetches model data from /v1/models API)
  config.elevenlabs_base_cost_per_1k = 0.30  # Base cost × model multiplier (default: $0.30 Pro overage)
  config.elevenlabs_models_cache_ttl = 21_600  # Cache TTL in seconds (default: 6 hours)

  # TTS pricing overrides (per 1K characters) — takes priority over API pricing
  config.tts_model_pricing = {
    "eleven_v3" => 0.24,        # custom rate
    "custom-model" => 0.10
  }
  config.default_tts_cost = 0.015  # fallback for unknown models
end

Supported Models

Transcription Models

Provider Model Notes
OpenAI whisper-1 Default, most reliable
OpenAI gpt-4o-transcribe Faster, better accuracy
OpenAI gpt-4o-mini-transcribe Budget option

TTS Models

Provider Model Generation Notes
OpenAI tts-1 Standard quality, low latency
OpenAI tts-1-hd HD quality
ElevenLabs eleven_monolingual_v1 v1 English only (deprecated)
ElevenLabs eleven_multilingual_v1 v1 Multi-language (deprecated)
ElevenLabs eleven_multilingual_v2 v2 29 languages, emotionally-aware
ElevenLabs eleven_turbo_v2 v2 English, balanced speed/quality
ElevenLabs eleven_flash_v2 v2 English, ultra-low latency (~75ms)
ElevenLabs eleven_turbo_v2_5 v2.5 32 languages, balanced
ElevenLabs eleven_flash_v2_5 v2.5 32 languages, low latency
ElevenLabs eleven_v3 v3 70+ languages, most expressive

OpenAI voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer


Cost Tracking

Transcription Costs

Transcription pricing is resolved automatically via the multi-source pricing cascade. The system checks user config, the RubyLLM gem, LiteLLM, Portkey AI, and other sources to find the best available price.

Model Typical Price
whisper-1 ~$0.006 / minute
gpt-4o-transcribe ~$0.01 / minute
gpt-4o-mini-transcribe ~$0.005 / minute

Costs are calculated based on audio duration. Override pricing for any model:

RubyLLM::Agents.configure do |c|
  c.transcription_model_pricing = {
    "whisper-1" => 0.006,
    "custom-model" => 0.05
  }
end

See Pricing for full details on sources, caching, and debugging.

TTS Costs

TTS costs use a 4-tier pricing cascade:

  1. LiteLLM JSON (auto-updating) — checked first for any model
  2. config.tts_model_pricing — user overrides per model
  3. ElevenLabs API — fetches character_cost_multiplier from /v1/models × elevenlabs_base_cost_per_1k
  4. Hardcoded fallbacks — last resort if API is unreachable
Provider Model Price / 1K chars Multiplier
OpenAI tts-1 $0.015
OpenAI tts-1-hd $0.030
ElevenLabs eleven_flash_v2, eleven_flash_v2_5 $0.15 0.5×
ElevenLabs eleven_turbo_v2, eleven_turbo_v2_5 $0.15 0.5×
ElevenLabs eleven_multilingual_v2 $0.30 1.0×
ElevenLabs eleven_v3 $0.30 1.0×
ElevenLabs v1 models (deprecated) $0.30 1.0×

ElevenLabs prices are dynamically calculated: elevenlabs_base_cost_per_1k × character_cost_multiplier. The default base rate ($0.30) matches the Pro plan overage rate. Users on different plans can override:

# Example: set the base rate to your own plan's overage rate
# (the 0.30 default already matches the Pro plan)
config.elevenlabs_base_cost_per_1k = 0.30

# Or override specific models directly
config.tts_model_pricing = { "eleven_v3" => 0.24 }

ElevenLabs Output Formats

ElevenLabs supports 28+ native output format strings. Use simple symbols for convenience or native strings for precise control:

# Simple symbols (convenience mapping)
MyNarrator.call(text: "Hello", format: :mp3)    # → mp3_44100_128
MyNarrator.call(text: "Hello", format: :wav)    # → wav_44100
MyNarrator.call(text: "Hello", format: :opus)   # → opus_48000_128
MyNarrator.call(text: "Hello", format: :pcm)    # → pcm_24000

# Native format strings (precise control)
MyNarrator.call(text: "Hello", format: "mp3_44100_192")  # High-bitrate MP3
MyNarrator.call(text: "Hello", format: "pcm_16000")      # Telephony PCM
MyNarrator.call(text: "Hello", format: "opus_48000_64")   # Low-bitrate Opus
MyNarrator.call(text: "Hello", format: "wav_22050")       # Low sample rate WAV

ElevenLabs Model Validation

Speaker automatically validates ElevenLabs models before making API calls:

  • Non-TTS models (e.g., eleven_english_sts_v2) → raises ConfigurationError
  • Text exceeding max characters → logs a warning (does not block)
  • Unsupported voice settings (e.g., style on eleven_v3) → logs a warning

Multi-Tenancy

Audio operations fully support multi-tenancy:

# Using resolver
result = Audio::MeetingTranscriber.call(audio: "meeting.mp3")
# Automatically uses Current.tenant if configured

# Explicit tenant
result = Audio::MeetingTranscriber.call(
  audio: "meeting.mp3",
  tenant: "acme_corp"
)

# Tenant with budget limits
result = Audio::ArticleNarrator.call(
  text: "Hello world",
  tenant: {
    id: "acme_corp",
    daily_limit: 50.0,
    enforcement: :hard
  }
)

Examples

Meeting Transcription

module Audio
  class MeetingTranscriber < ApplicationTranscriber
    model "whisper-1"
    language "en"
    include_timestamps :segment

    def prompt
      "Business meeting with technical discussions about software"
    end

    def postprocess_text(text)
      text
        .gsub(/\bum\b/i, '')
        .gsub(/\buh\b/i, '')
        .squeeze(' ')
    end
  end
end

result = Audio::MeetingTranscriber.call(audio: "meeting.mp3")
puts result.text
puts "Duration: #{result.audio_duration} seconds"
puts "Cost: $#{result.total_cost}"

Article Narration

module Audio
  class ArticleNarrator < ApplicationSpeaker
    provider :openai
    model "tts-1-hd"
    voice "nova"
    speed 1.1

    lexicon do
      pronounce "API", "A P I"
      pronounce "JSON", "jay-son"
    end
  end
end

result = Audio::ArticleNarrator.call(text: article_content)
result.save_to("article_audio.mp3")
puts "Duration: #{result.duration} seconds"
puts "Cost: $#{result.total_cost}"

Voice Assistant Pipeline

class VoiceAssistant
  def process_query(audio_file)
    # 1. Transcribe user's voice
    transcription = Audio::QueryTranscriber.call(audio: audio_file)

    # 2. Process with AI agent
    response = LLM::AssistantAgent.call(query: transcription.text)

    # 3. Convert response to speech
    speech = Audio::ResponseSpeaker.call(text: response.content)

    {
      transcription: transcription,
      response: response,
      speech: speech,
      total_cost: transcription.total_cost + response.total_cost + speech.total_cost
    }
  end
end

Clone this wiki locally