Skip to content
adham90 edited this page Feb 20, 2026 · 4 revisions

Audio Support

RubyLLM::Agents provides two base classes for audio operations:

  • Transcriber - Audio-to-text (speech recognition)
  • Speaker - Text-to-audio (text-to-speech / TTS)

Table of Contents


Transcription (Audio → Text)

Convert audio files to text using speech recognition models.

Transcriber Quick Start

# Generate a transcriber
rails generate ruby_llm_agents:transcriber meeting

# app/agents/audio/meeting_transcriber.rb
module Audio
  class MeetingTranscriber < ApplicationTranscriber
    model "whisper-1"
    language "en"
  end
end

# Usage
result = Audio::MeetingTranscriber.call(audio: "meeting.mp3")
result.text           # "Hello, welcome to the meeting..."
result.audio_duration # 120.5 (seconds)
result.total_cost     # 0.012

Transcriber DSL

class MyTranscriber < ApplicationTranscriber
  # Model selection
  model "whisper-1"              # Default transcription model
  # Alternatives: "gpt-4o-transcribe", "gpt-4o-mini-transcribe"

  # Language settings
  language "en"                   # ISO 639-1 code (nil = auto-detect)

  # Output format
  output_format :text             # :text, :json, :srt, :vtt

  # Timestamp granularity
  include_timestamps :segment     # :none, :segment, :word

  # Caching
  cache_for 30.days               # Enable caching

  # Optional: Provide context for better accuracy
  def prompt
    "Technical discussion about Ruby programming"
  end

  # Optional: Post-process transcription
  def postprocess_text(text)
    text
      .gsub(/\bum\b/i, '')       # Remove filler words
      .gsub(/\buh\b/i, '')
      .squeeze(' ')
  end
end

Input Sources

# From file path
result = Audio::MeetingTranscriber.call(audio: "meeting.mp3")

# From URL
result = Audio::MeetingTranscriber.call(audio: "https://example.com/audio.mp3")

# From File object
result = Audio::MeetingTranscriber.call(audio: File.open("meeting.mp3"))

# From binary data with format hint
result = Audio::MeetingTranscriber.call(audio: audio_blob, format: :mp3)

Reliability Configuration

class ReliableTranscriber < ApplicationTranscriber
  model "gpt-4o-transcribe"

  reliability do
    retry_on_failure max_attempts: 3
  end

  fallback_models "whisper-1", "gpt-4o-mini-transcribe"
end

TranscriptionResult

result = Audio::MeetingTranscriber.call(audio: "meeting.mp3")

# Text content
result.text              # Full transcription text
result.segments          # Array of segments with timestamps
result.words             # Array of words with timestamps (if requested)

# Audio metadata
result.audio_duration    # Duration in seconds
result.audio_format      # Detected format
result.language          # Requested language
result.detected_language # Auto-detected language

# Execution metadata
result.model_id          # Model used
result.duration_ms       # Processing time
result.total_cost        # Cost in USD
result.started_at        # Execution start time
result.completed_at      # Execution end time
result.tenant_id         # Multi-tenant identifier

# Status
result.success?          # true if no error
result.error?            # true if failed

# Subtitle generation
result.srt               # SRT subtitle format
result.vtt               # VTT subtitle format

# Analysis helpers
result.words_per_minute        # Speaking rate
result.segment_at(30.5)        # Find segment at timestamp
result.text_between(10, 60)    # Get text in time range

Subtitle Generation

module Audio
  class SubtitleTranscriber < ApplicationTranscriber
    model "whisper-1"
    include_timestamps :segment
  end
end

result = Audio::SubtitleTranscriber.call(audio: "video.mp4")

# Save as SRT (for video players)
File.write("captions.srt", result.srt)

# Save as VTT (for web video)
File.write("captions.vtt", result.vtt)

SRT Output:

1
00:00:00,000 --> 00:00:02,500
Hello everyone.

2
00:00:02,500 --> 00:00:05,000
Welcome to the meeting.

VTT Output:

WEBVTT

00:00:00.000 --> 00:00:02.500
Hello everyone.

00:00:02.500 --> 00:00:05.000
Welcome to the meeting.

Text-to-Speech (Text → Audio)

Generate natural speech audio from text.

Speaker Quick Start

# Generate a speaker
rails generate ruby_llm_agents:speaker narrator

# app/agents/audio/narrator_speaker.rb
module Audio
  class NarratorSpeaker < ApplicationSpeaker
    provider :openai
    model "tts-1"
    voice "nova"
  end
end

# Usage
result = Audio::NarratorSpeaker.call(text: "Hello, world!")
result.audio          # Binary audio data
result.save_to("output.mp3")

Speaker DSL

class MyNarrator < ApplicationSpeaker
  # Provider selection
  provider :openai              # :openai, :elevenlabs

  # Model and voice
  model "tts-1-hd"              # "tts-1" for faster/cheaper
  voice "nova"                  # Voice name
  # OpenAI voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer

  # Voice ID (ElevenLabs)
  voice_id "21m00Tcm..."        # Voice ID for cloned voices

  # Audio settings
  speed 1.0                     # Speech speed (0.25-4.0 for OpenAI)
  output_format :mp3            # :mp3, :wav, :opus, :pcm, :alaw, :ulaw
  # Or use ElevenLabs native format strings directly:
  # output_format "mp3_44100_192"  # High-bitrate MP3
  # output_format "opus_48000_64"  # Low-latency Opus
  # output_format "pcm_16000"      # Telephony PCM

  # Streaming
  streaming true                # Enable streaming mode

  # Caching
  cache_for 7.days              # Enable caching

  # Custom pronunciation lexicon
  lexicon do
    pronounce "API", "A P I"
    pronounce "SQL", "sequel"
    pronounce "RubyLLM", "ruby L L M"
    pronounce "nginx", "engine-X"
  end
end

SpeechResult

result = Audio::ArticleNarrator.call(text: "Hello!")

# Audio data
result.audio          # Binary audio data
result.save_to(path)  # Save to file
result.to_base64      # Base64 encoded
result.to_data_uri    # Data URI for web embedding

# Metadata
result.duration       # Audio duration (seconds)
result.format         # Output format (:mp3, :wav, etc.)
result.file_size      # Size in bytes
result.characters     # Input character count

# Provider info
result.provider       # :openai, :elevenlabs
result.model_id       # Model used
result.voice_id       # Voice identifier
result.voice_name     # Voice name

# Execution
result.duration_ms    # Processing time
result.total_cost     # Cost in USD
result.started_at     # Execution start time
result.completed_at   # Execution end time
result.tenant_id      # Multi-tenant identifier
result.success?       # true if no error

Streaming Audio

module Audio
  class StreamingNarrator < ApplicationSpeaker
    provider :elevenlabs
    voice "Rachel"
    streaming true
  end
end

# Stream to audio player
Audio::StreamingNarrator.stream(text: "Long article...") do |chunk|
  audio_player.play(chunk.audio)
end

# Force streaming on any speaker
Audio::ArticleNarrator.stream(text: "Hello!") do |chunk|
  buffer << chunk.audio
end

ElevenLabs Configuration

class PremiumNarrator < ApplicationSpeaker
  provider :elevenlabs
  model "eleven_multilingual_v2"
  voice_id "21m00Tcm4TlvDq8ikWAM"

  # Voice settings specific to ElevenLabs
  voice_settings do
    stability 0.5            # 0-1: Lower = more expressive
    similarity_boost 0.75    # 0-1: Higher = closer to original voice
    style 0.5                # 0-1: Style exaggeration
    speaker_boost true       # Enhance speaker clarity
  end
end

Reliability Configuration

class ReliableSpeaker < ApplicationSpeaker
  provider :elevenlabs
  voice "Rachel"

  reliability do
    retry_on_failure max_attempts: 3
  end

  fallback_models "tts-1-hd", "tts-1"
end

Generators

Generate a Transcriber

# Basic transcriber
rails generate ruby_llm_agents:transcriber meeting

# With options
rails generate ruby_llm_agents:transcriber meeting --model gpt-4o-transcribe
rails generate ruby_llm_agents:transcriber meeting --language es
rails generate ruby_llm_agents:transcriber meeting --output-format srt
rails generate ruby_llm_agents:transcriber meeting --cache 30.days

This creates:

  • app/agents/audio/application_transcriber.rb (if not exists)
  • app/agents/audio/meeting_transcriber.rb

Generate a Speaker

# Basic speaker
rails generate ruby_llm_agents:speaker narrator

# With options
rails generate ruby_llm_agents:speaker narrator --provider elevenlabs
rails generate ruby_llm_agents:speaker narrator --voice alloy
rails generate ruby_llm_agents:speaker narrator --speed 1.25
rails generate ruby_llm_agents:speaker narrator --format wav
rails generate ruby_llm_agents:speaker narrator --cache 7.days

This creates:

  • app/agents/audio/application_speaker.rb (if not exists)
  • app/agents/audio/narrator_speaker.rb

Configuration

Global Defaults

# config/initializers/ruby_llm_agents.rb
RubyLLM::Agents.configure do |config|
  # Transcription defaults
  config.default_transcription_model = "whisper-1"
  config.track_transcriptions = true

  # TTS defaults
  config.default_tts_provider = :openai
  config.default_tts_model = "tts-1"
  config.default_tts_voice = "nova"
  config.track_speech = true

  # ElevenLabs (required for :elevenlabs provider)
  config.elevenlabs_api_key = ENV["ELEVENLABS_API_KEY"]
  config.elevenlabs_api_base = "https://api.elevenlabs.io"  # optional, default

  # ElevenLabs dynamic pricing (fetches model data from /v1/models API)
  config.elevenlabs_base_cost_per_1k = 0.30  # Base cost × model multiplier (default: $0.30 Pro overage)
  config.elevenlabs_models_cache_ttl = 21_600  # Cache TTL in seconds (default: 6 hours)

  # TTS pricing overrides (per 1K characters) — takes priority over API pricing
  config.tts_model_pricing = {
    "eleven_v3" => 0.24,        # custom rate
    "custom-model" => 0.10
  }
  config.default_tts_cost = 0.015  # fallback for unknown models
end

Supported Models

Transcription Models

Provider Model Notes
OpenAI whisper-1 Default, most reliable
OpenAI gpt-4o-transcribe Faster, better accuracy
OpenAI gpt-4o-mini-transcribe Budget option

TTS Models

Provider Model Generation Notes
OpenAI tts-1 Standard quality, low latency
OpenAI tts-1-hd HD quality
ElevenLabs eleven_monolingual_v1 v1 English only (deprecated)
ElevenLabs eleven_multilingual_v1 v1 Multi-language (deprecated)
ElevenLabs eleven_multilingual_v2 v2 29 languages, emotionally-aware
ElevenLabs eleven_turbo_v2 v2 English, balanced speed/quality
ElevenLabs eleven_flash_v2 v2 English, ultra-low latency (~75ms)
ElevenLabs eleven_turbo_v2_5 v2.5 32 languages, balanced
ElevenLabs eleven_flash_v2_5 v2.5 32 languages, low latency
ElevenLabs eleven_v3 v3 70+ languages, most expressive

OpenAI voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer


Cost Tracking

Transcription Costs

Transcription pricing is resolved automatically via the multi-source pricing cascade. The system checks user config, the RubyLLM gem, LiteLLM, Portkey AI, and other sources to find the best available price.

Model Typical Price
whisper-1 ~$0.006 / minute
gpt-4o-transcribe ~$0.01 / minute
gpt-4o-mini-transcribe ~$0.005 / minute

Costs are calculated based on audio duration. Override pricing for any model:

RubyLLM::Agents.configure do |c|
  c.transcription_model_pricing = {
    "whisper-1" => 0.006,
    "custom-model" => 0.05
  }
end

See Pricing for full details on sources, caching, and debugging.

TTS Costs

TTS costs use a 4-tier pricing cascade:

  1. LiteLLM JSON (auto-updating) — checked first for any model
  2. config.tts_model_pricing — user overrides per model
  3. ElevenLabs API — fetches character_cost_multiplier from /v1/models × elevenlabs_base_cost_per_1k
  4. Hardcoded fallbacks — last resort if API is unreachable
Provider Model Price / 1K chars Multiplier
OpenAI tts-1 $0.015
OpenAI tts-1-hd $0.030
ElevenLabs eleven_flash_v2, eleven_flash_v2_5 $0.15 0.5×
ElevenLabs eleven_turbo_v2, eleven_turbo_v2_5 $0.15 0.5×
ElevenLabs eleven_multilingual_v2 $0.30 1.0×
ElevenLabs eleven_v3 $0.30 1.0×
ElevenLabs v1 models (deprecated) $0.30 1.0×

ElevenLabs prices are dynamically calculated: elevenlabs_base_cost_per_1k × character_cost_multiplier. The default base rate ($0.30) matches the Pro plan overage rate. Users on different plans can override:

# Example: set the base rate to your own plan's overage rate
# (the 0.30 default already matches the Pro plan)
config.elevenlabs_base_cost_per_1k = 0.30

# Or override specific models directly
config.tts_model_pricing = { "eleven_v3" => 0.24 }

ElevenLabs Output Formats

ElevenLabs supports 28+ native output format strings. Use simple symbols for convenience or native strings for precise control:

# Simple symbols (convenience mapping)
MyNarrator.call(text: "Hello", format: :mp3)    # → mp3_44100_128
MyNarrator.call(text: "Hello", format: :wav)    # → wav_44100
MyNarrator.call(text: "Hello", format: :opus)   # → opus_48000_128
MyNarrator.call(text: "Hello", format: :pcm)    # → pcm_24000

# Native format strings (precise control)
MyNarrator.call(text: "Hello", format: "mp3_44100_192")  # High-bitrate MP3
MyNarrator.call(text: "Hello", format: "pcm_16000")      # Telephony PCM
MyNarrator.call(text: "Hello", format: "opus_48000_64")   # Low-bitrate Opus
MyNarrator.call(text: "Hello", format: "wav_22050")       # Low sample rate WAV

ElevenLabs Model Validation

Speaker automatically validates ElevenLabs models before making API calls:

  • Non-TTS models (e.g., eleven_english_sts_v2) → raises ConfigurationError
  • Text exceeding max characters → logs a warning (does not block)
  • Unsupported voice settings (e.g., style on eleven_v3) → logs a warning

Multi-Tenancy

Audio operations fully support multi-tenancy:

# Using resolver
result = Audio::MeetingTranscriber.call(audio: "meeting.mp3")
# Automatically uses Current.tenant if configured

# Explicit tenant
result = Audio::MeetingTranscriber.call(
  audio: "meeting.mp3",
  tenant: "acme_corp"
)

# Tenant with budget limits
result = Audio::ArticleNarrator.call(
  text: "Hello world",
  tenant: {
    id: "acme_corp",
    daily_limit: 50.0,
    enforcement: :hard
  }
)

Examples

Meeting Transcription

module Audio
  class MeetingTranscriber < ApplicationTranscriber
    model "whisper-1"
    language "en"
    include_timestamps :segment

    def prompt
      "Business meeting with technical discussions about software"
    end

    def postprocess_text(text)
      text
        .gsub(/\bum\b/i, '')
        .gsub(/\buh\b/i, '')
        .squeeze(' ')
    end
  end
end

result = Audio::MeetingTranscriber.call(audio: "meeting.mp3")
puts result.text
puts "Duration: #{result.audio_duration} seconds"
puts "Cost: $#{result.total_cost}"

Article Narration

module Audio
  class ArticleNarrator < ApplicationSpeaker
    provider :openai
    model "tts-1-hd"
    voice "nova"
    speed 1.1

    lexicon do
      pronounce "API", "A P I"
      pronounce "JSON", "jay-son"
    end
  end
end

result = Audio::ArticleNarrator.call(text: article_content)
result.save_to("article_audio.mp3")
puts "Duration: #{result.duration} seconds"
puts "Cost: $#{result.total_cost}"

Voice Assistant Pipeline

class VoiceAssistant
  def process_query(audio_file)
    # 1. Transcribe user's voice
    transcription = Audio::QueryTranscriber.call(audio: audio_file)

    # 2. Process with AI agent
    response = LLM::AssistantAgent.call(query: transcription.text)

    # 3. Convert response to speech
    speech = Audio::ResponseSpeaker.call(text: response.content)

    {
      transcription: transcription,
      response: response,
      speech: speech,
      total_cost: transcription.total_cost + response.total_cost + speech.total_cost
    }
  end
end

Clone this wiki locally