5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
## [Unreleased]

### Added
- `base_url` and `language` fields in `[llm.stt]` config for OpenAI-compatible local whisper servers (e.g. whisper.cpp)
- `ZEPH_STT_BASE_URL` and `ZEPH_STT_LANGUAGE` environment variable overrides
- Whisper API provider now passes `language` parameter for accurate non-English transcription
- Documentation for whisper.cpp server setup with Metal acceleration on macOS
- Per-sub-provider `base_url` and `embedding_model` overrides in orchestrator config
- Full orchestrator example with cloud + local + STT in default.toml
- All previously undocumented config keys in default.toml (`agent.auto_update_check`, `llm.stt`, `llm.vision_model`, `skills.disambiguation_threshold`, `tools.filters.*`, `tools.permissions`, `a2a.auth_token`, `mcp.servers.env`)
@@ -17,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
- Vault age backend now falls back to default directory for key/path when `--vault-key`/`--vault-path` are not provided, matching `zeph vault init` behavior (#613)

### Changed
- Whisper STT provider no longer requires OpenAI API key when `base_url` points to a local server
- Orchestrator sub-providers now resolve `base_url` and `embedding_model` via fallback chain: per-provider, parent section, global default

## [0.11.1] - 2026-02-19
41 changes: 32 additions & 9 deletions crates/zeph-core/src/agent/mod.rs
@@ -73,6 +73,7 @@ struct QueuedMessage {
text: String,
received_at: Instant,
image_parts: Vec<zeph_llm::provider::MessagePart>,
raw_attachments: Vec<crate::channel::Attachment>,
}

pub(super) struct MemoryState {
@@ -560,20 +561,23 @@ impl<C: Channel, T: ToolExecutor> Agent<C, T> {
self.message_queue.pop_back();
continue;
}
self.enqueue_or_merge(msg.text, vec![]);
self.enqueue_or_merge(msg.text, vec![], msg.attachments);
}
}

fn enqueue_or_merge(
&mut self,
text: String,
image_parts: Vec<zeph_llm::provider::MessagePart>,
raw_attachments: Vec<crate::channel::Attachment>,
) {
let now = Instant::now();
if let Some(last) = self.message_queue.back_mut()
&& now.duration_since(last.received_at) < MESSAGE_MERGE_WINDOW
&& last.image_parts.is_empty()
&& image_parts.is_empty()
&& last.raw_attachments.is_empty()
&& raw_attachments.is_empty()
{
last.text.push('\n');
last.text.push_str(&text);
@@ -584,6 +588,7 @@ impl<C: Channel, T: ToolExecutor> Agent<C, T> {
text,
received_at: now,
image_parts,
raw_attachments,
});
} else {
tracing::warn!("message queue full, dropping message");
@@ -649,7 +654,15 @@ impl<C: Channel, T: ToolExecutor> Agent<C, T> {

let (text, image_parts) = if let Some(queued) = self.message_queue.pop_front() {
self.notify_queue_count().await;
(queued.text, queued.image_parts)
if queued.raw_attachments.is_empty() {
(queued.text, queued.image_parts)
} else {
let msg = crate::channel::ChannelMessage {
text: queued.text,
attachments: queued.raw_attachments,
};
self.resolve_message(msg).await
}
} else {
let incoming = tokio::select! {
result = self.channel.recv() => result?,
@@ -708,6 +721,12 @@ impl<C: Channel, T: ToolExecutor> Agent<C, T> {
.into_iter()
.partition(|a| a.kind == AttachmentKind::Audio);

tracing::debug!(
audio = audio_attachments.len(),
has_stt = self.stt.is_some(),
"resolve_message attachments"
);

let text = if !audio_attachments.is_empty()
&& let Some(stt) = self.stt.as_ref()
{
@@ -2029,7 +2048,7 @@ pub(super) mod agent_tests {
let executor = MockToolExecutor::no_tools();
let mut agent = Agent::new(provider, channel, registry, None, 5, executor);

agent.enqueue_or_merge("hello".into(), vec![]);
agent.enqueue_or_merge("hello".into(), vec![], vec![]);
assert_eq!(agent.message_queue.len(), 1);
assert_eq!(agent.message_queue[0].text, "hello");
}
@@ -2042,8 +2061,8 @@
let executor = MockToolExecutor::no_tools();
let mut agent = Agent::new(provider, channel, registry, None, 5, executor);

agent.enqueue_or_merge("first".into(), vec![]);
agent.enqueue_or_merge("second".into(), vec![]);
agent.enqueue_or_merge("first".into(), vec![], vec![]);
agent.enqueue_or_merge("second".into(), vec![], vec![]);
assert_eq!(agent.message_queue.len(), 1);
assert_eq!(agent.message_queue[0].text, "first\nsecond");
}
@@ -2060,8 +2079,9 @@
text: "old".into(),
received_at: Instant::now() - Duration::from_secs(2),
image_parts: vec![],
raw_attachments: vec![],
});
agent.enqueue_or_merge("new".into(), vec![]);
agent.enqueue_or_merge("new".into(), vec![], vec![]);
assert_eq!(agent.message_queue.len(), 2);
assert_eq!(agent.message_queue[0].text, "old");
assert_eq!(agent.message_queue[1].text, "new");
@@ -2080,9 +2100,10 @@
text: format!("msg{i}"),
received_at: Instant::now() - Duration::from_secs(2),
image_parts: vec![],
raw_attachments: vec![],
});
}
agent.enqueue_or_merge("overflow".into(), vec![]);
agent.enqueue_or_merge("overflow".into(), vec![], vec![]);
assert_eq!(agent.message_queue.len(), MAX_QUEUE_SIZE);
}

@@ -2094,11 +2115,11 @@
let executor = MockToolExecutor::no_tools();
let mut agent = Agent::new(provider, channel, registry, None, 5, executor);

agent.enqueue_or_merge("a".into(), vec![]);
agent.enqueue_or_merge("a".into(), vec![], vec![]);
// Wait past merge window
agent.message_queue.back_mut().unwrap().received_at =
Instant::now() - Duration::from_secs(1);
agent.enqueue_or_merge("b".into(), vec![]);
agent.enqueue_or_merge("b".into(), vec![], vec![]);
assert_eq!(agent.message_queue.len(), 2);

let count = agent.clear_queue();
@@ -2137,6 +2158,7 @@ pub(super) mod agent_tests {
text: format!("pre{i}"),
received_at: Instant::now() - Duration::from_secs(2),
image_parts: vec![],
raw_attachments: vec![],
});
}
agent.drain_channel();
@@ -2157,6 +2179,7 @@
text: format!("msg{i}"),
received_at: Instant::now() - Duration::from_secs(2),
image_parts: vec![],
raw_attachments: vec![],
});
}

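The queue semantics above mean a message carrying raw attachments is never merged with a neighbor, so a voice note always gets its own turn and its own STT pass. A test-style sketch of that invariant, mirroring the `agent_tests` helpers; `audio_attachment()` is a hypothetical constructor for a `crate::channel::Attachment` of kind `Audio`:

```rust
// Sketch only: provider/channel/registry/executor come from the same mock
// helpers used throughout agent_tests above.
let mut agent = Agent::new(provider, channel, registry, None, 5, executor);

agent.enqueue_or_merge("voice note".into(), vec![], vec![audio_attachment()]);
agent.enqueue_or_merge("follow-up".into(), vec![], vec![]);

// Both arrive inside MESSAGE_MERGE_WINDOW, but the non-empty
// raw_attachments on the first message blocks merging.
assert_eq!(agent.message_queue.len(), 2);
```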
3 changes: 2 additions & 1 deletion crates/zeph-core/src/bootstrap.rs
@@ -555,7 +555,8 @@ pub fn create_provider(config: &Config) -> anyhow::Result<AnyProvider> {
providers,
))))
}
other => bail!("LLM provider {other} not available"),
#[cfg(not(feature = "candle"))]
ProviderKind::Candle => bail!("candle feature is not enabled"),
}
}

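Replacing the old `other => bail!(...)` catch-all with a `#[cfg(not(feature = "candle"))]` arm keeps the match exhaustive in both feature configurations, so a future `ProviderKind` variant fails at compile time instead of at runtime. A self-contained sketch of the pattern (variant and builder names are illustrative, not the crate's real ones):

```rust
use anyhow::bail;

enum ProviderKind {
    OpenAi,
    Candle,
}

fn create(kind: ProviderKind) -> anyhow::Result<&'static str> {
    match kind {
        ProviderKind::OpenAi => Ok("openai"),
        // Arm compiled only when the feature is enabled.
        #[cfg(feature = "candle")]
        ProviderKind::Candle => Ok("candle"),
        // Same variant when the feature is off: no silent catch-all,
        // and the match stays exhaustive either way.
        #[cfg(not(feature = "candle"))]
        ProviderKind::Candle => bail!("candle feature is not enabled"),
    }
}
```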
24 changes: 23 additions & 1 deletion crates/zeph-core/src/config/env.rs
@@ -1,4 +1,4 @@
use super::{Config, SttConfig, default_stt_model, default_stt_provider};
use super::{Config, SttConfig, default_stt_language, default_stt_model, default_stt_provider};

impl Config {
pub(crate) fn apply_env_overrides(&mut self) {
@@ -141,16 +141,38 @@ impl Config {
let stt = self.llm.stt.get_or_insert_with(|| SttConfig {
provider: default_stt_provider(),
model: default_stt_model(),
language: default_stt_language(),
base_url: None,
});
stt.provider = v;
}
if let Ok(v) = std::env::var("ZEPH_STT_MODEL") {
let stt = self.llm.stt.get_or_insert_with(|| SttConfig {
provider: default_stt_provider(),
model: default_stt_model(),
language: default_stt_language(),
base_url: None,
});
stt.model = v;
}
if let Ok(v) = std::env::var("ZEPH_STT_LANGUAGE") {
let stt = self.llm.stt.get_or_insert_with(|| SttConfig {
provider: default_stt_provider(),
model: default_stt_model(),
language: default_stt_language(),
base_url: None,
});
stt.language = v;
}
if let Ok(v) = std::env::var("ZEPH_STT_BASE_URL") {
let stt = self.llm.stt.get_or_insert_with(|| SttConfig {
provider: default_stt_provider(),
model: default_stt_model(),
language: default_stt_language(),
base_url: None,
});
stt.base_url = Some(v);
}
if let Ok(v) = std::env::var("ZEPH_AUTO_UPDATE_CHECK")
&& let Ok(enabled) = v.parse::<bool>()
{
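Each override lazily creates a default `SttConfig` before writing its one field, so environment variables compose with a partial `[llm.stt]` section instead of replacing it. A test-style sketch of the resulting precedence, assuming `Config` implements `Default` and crate-internal access to `apply_env_overrides`:

```rust
// NOTE: std::env::set_var is unsafe on the 2024 edition; wrap it in
// unsafe {} there. Shown bare for brevity.
std::env::set_var("ZEPH_STT_LANGUAGE", "ru");

let mut config = Config::default();
config.apply_env_overrides();

let stt = config.llm.stt.as_ref().expect("override creates the section");
assert_eq!(stt.language, "ru");     // from the environment
assert_eq!(stt.model, "whisper-1"); // untouched default_stt_model()
```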
8 changes: 8 additions & 0 deletions crates/zeph-core/src/config/types.rs
@@ -135,6 +135,10 @@ pub struct SttConfig {
pub provider: String,
#[serde(default = "default_stt_model")]
pub model: String,
#[serde(default = "default_stt_language")]
pub language: String,
#[serde(default)]
pub base_url: Option<String>,
}

pub(crate) fn default_stt_provider() -> String {
@@ -145,6 +149,10 @@ pub(crate) fn default_stt_model() -> String {
"whisper-1".into()
}

pub(crate) fn default_stt_language() -> String {
"auto".into()
}

#[derive(Debug, Deserialize, Serialize)]
pub struct CloudLlmConfig {
pub model: String,
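Because every new field carries a serde default, a minimal `[llm.stt]` table still deserializes cleanly. A quick sketch, assuming the `toml` crate and `SttConfig` in scope:

```rust
let stt: SttConfig = toml::from_str(r#"provider = "whisper""#).unwrap();
assert_eq!(stt.model, "whisper-1"); // default_stt_model
assert_eq!(stt.language, "auto");   // default_stt_language
assert!(stt.base_url.is_none());    // plain #[serde(default)]
```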
24 changes: 20 additions & 4 deletions crates/zeph-llm/src/candle_whisper.rs
@@ -18,6 +18,7 @@ pub struct CandleWhisperProvider {
mel_filters: Vec<f32>,
tokenizer: Arc<Tokenizer>,
device: Device,
language: String,
}

impl std::fmt::Debug for CandleWhisperProvider {
@@ -58,7 +59,7 @@ impl CandleWhisperProvider {
/// # Errors
///
/// Returns `LlmError::ModelLoad` if downloading or loading fails.
pub fn load(repo_id: &str, device: Option<Device>) -> Result<Self, LlmError> {
pub fn load(repo_id: &str, device: Option<Device>, language: &str) -> Result<Self, LlmError> {
let device = device.unwrap_or_else(detect_device);
tracing::info!(
repo = repo_id,
@@ -117,6 +118,7 @@ impl CandleWhisperProvider {
mel_filters,
tokenizer: Arc::new(tokenizer),
device,
language: language.to_string(),
})
}

@@ -145,8 +147,15 @@ impl CandleWhisperProvider {
.token_to_id(m::EOT_TOKEN)
.ok_or_else(|| LlmError::TranscriptionFailed("missing EOT token".into()))?;

let language_token = self.tokenizer.token_to_id("<|en|>").ok_or_else(|| {
LlmError::TranscriptionFailed("language token not found in tokenizer".into())
let lang_tag = if self.language == "auto" {
"<|en|>".to_string()
} else {
format!("<|{}|>", self.language)
};
let language_token = self.tokenizer.token_to_id(&lang_tag).ok_or_else(|| {
LlmError::TranscriptionFailed(format!(
"language token {lang_tag} not found in tokenizer"
))
})?;

let mut model = self
@@ -189,7 +198,14 @@

Ok(Transcription {
text: text.trim().to_string(),
language: Some("en".into()),
language: Some(
if self.language == "auto" {
"en"
} else {
&self.language
}
.into(),
),
duration_secs: Some(pcm.len() as f32 / m::SAMPLE_RATE as f32),
})
}
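With `language` threaded through `load`, callers choose the decode language once at construction; note that `"auto"` currently falls back to the `<|en|>` token rather than running detection. A hedged usage sketch; the repo id and the `transcribe` signature are assumptions inferred from the PCM handling above:

```rust
// Sketch only: assumes an async transcribe(&[u8]) -> Result<Transcription, LlmError>.
async fn transcribe_ru(audio: &[u8]) -> Result<(), LlmError> {
    let stt = CandleWhisperProvider::load("openai/whisper-tiny", None, "ru")?;
    let out = stt.transcribe(audio).await?;
    println!("{} ({:?})", out.text, out.language); // language echoes Some("ru")
    Ok(())
}
```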
16 changes: 15 additions & 1 deletion crates/zeph-llm/src/whisper.rs
@@ -9,6 +9,7 @@ pub struct WhisperProvider {
api_key: String,
base_url: String,
model: String,
language: Option<String>,
}

impl WhisperProvider {
@@ -24,8 +25,18 @@ impl WhisperProvider {
api_key: api_key.into(),
base_url: base_url.into(),
model: model.into(),
language: None,
}
}

#[must_use]
pub fn with_language(mut self, language: impl Into<String>) -> Self {
let lang = language.into();
if lang != "auto" && !lang.is_empty() {
self.language = Some(lang);
}
self
}
}

impl std::fmt::Debug for WhisperProvider {
@@ -56,10 +67,13 @@ impl SpeechToText for WhisperProvider {
.mime_str("application/octet-stream")
.map_err(|e| LlmError::TranscriptionFailed(e.to_string()))?;

let form = reqwest::multipart::Form::new()
let mut form = reqwest::multipart::Form::new()
.text("model", self.model.clone())
.text("response_format", "json")
.part("file", part);
if let Some(ref lang) = self.language {
form = form.text("language", lang.clone());
}

let url = format!(
"{}/audio/transcriptions",
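Since `with_language` silently drops `"auto"` and empty strings, callers can forward the configured value unconditionally. A usage sketch, assuming `new(api_key, base_url, model)` follows the field order shown in the constructor:

```rust
// Empty API key is acceptable for a local server, per the CHANGELOG entry.
let stt = WhisperProvider::new("", "http://127.0.0.1:8080/v1", "large-v3")
    .with_language("ru"); // "auto" or "" would leave the form field unset
```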
43 changes: 42 additions & 1 deletion docs/src/advanced/multimodal.md
@@ -20,15 +20,56 @@ provider = "whisper"
model = "whisper-1"
```

The Whisper provider inherits the OpenAI API key from `[llm.openai]` or `ZEPH_OPENAI_API_KEY`. Environment variable overrides: `ZEPH_STT_PROVIDER`, `ZEPH_STT_MODEL`.
When `base_url` is omitted, the provider uses the OpenAI API key from `[llm.openai]` or `ZEPH_OPENAI_API_KEY`. Set `base_url` to point at any OpenAI-compatible server (no API key required for local servers). The `language` field accepts an [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) code (e.g. `ru`, `en`, `de`) or `auto` for automatic detection.

Environment variable overrides: `ZEPH_STT_PROVIDER`, `ZEPH_STT_MODEL`, `ZEPH_STT_LANGUAGE`, `ZEPH_STT_BASE_URL`.

### Backends

| Backend | Provider | Feature | Description |
|---------|----------|---------|-------------|
| OpenAI Whisper API | `whisper` | `stt` | Cloud-based transcription |
| OpenAI-compatible server | `whisper` | `stt` | Any local server with `/v1/audio/transcriptions` |
| Local Whisper | `candle-whisper` | `candle` | Fully offline via candle |

### Local Whisper Server (whisper.cpp)

This is the recommended setup for local speech-to-text: it uses Metal acceleration on Apple Silicon and handles all audio formats (including Telegram OGG/Opus) server-side.

**Install and run:**

```bash
brew install whisper-cpp

# Download a model
curl -L -o ~/.cache/whisper/ggml-large-v3.bin \
https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3.bin

# Start the server
whisper-server \
--model ~/.cache/whisper/ggml-large-v3.bin \
--host 127.0.0.1 --port 8080 \
--inference-path "/v1/audio/transcriptions" \
--convert
```

**Configure Zeph:**

```toml
[llm.stt]
provider = "whisper"
model = "large-v3"
base_url = "http://127.0.0.1:8080/v1"
language = "en" # ISO-639-1 code or "auto"
```

| Model | Parameters | Disk | Notes |
|-------|------------|------|-------|
| `ggml-tiny` | 39M | ~75 MB | Fastest, lower accuracy |
| `ggml-base` | 74M | ~142 MB | Good balance |
| `ggml-small` | 244M | ~466 MB | Better accuracy |
| `ggml-large-v3` | 1.5B | ~2.9 GB | Best accuracy |

### Local Whisper (Candle)

```bash