Merged
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
## [Unreleased]

### Added
- `TokenCounter` in `zeph-memory` — accurate token counting with `tiktoken-rs` cl100k_base, replacing `chars/4` heuristic (#789)
- DashMap-backed token cache (10k cap) for amortized O(1) lookups
- OpenAI tool schema token formula for precise context budget allocation
- Input size guard (64KB) on token counting to prevent cache pollution from oversized inputs
- Graceful fallback to `chars/4` when tiktoken tokenizer is unavailable
- Configurable tool response offload — `OverflowConfig` with threshold (default 50k chars), retention (7 days), optional custom dir (#791)
- `[tools.overflow]` section in `config.toml` for offload configuration
- Security hardening: path canonicalization, symlink-safe cleanup, 0o600 file permissions on Unix
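The counting, caching, guard, and fallback entries above fit together in one small type. The sketch below is illustrative only: std `HashMap` + `Mutex` stands in for the crate's DashMap, and the `chars/4` fallback stands in for the tiktoken cl100k_base encoder; names such as `fallback_estimate` are assumptions, not the crate's actual API.

```rust
use std::collections::HashMap;
use std::sync::Mutex;

const MAX_INPUT_BYTES: usize = 64 * 1024; // input size guard
const CACHE_CAP: usize = 10_000; // cache capacity cap

pub struct TokenCounter {
    // DashMap in the real crate; Mutex<HashMap> here to stay dependency-free.
    cache: Mutex<HashMap<String, usize>>,
}

impl TokenCounter {
    pub fn new() -> Self {
        Self { cache: Mutex::new(HashMap::new()) }
    }

    /// The chars/4 heuristic, kept as the graceful fallback.
    fn fallback_estimate(text: &str) -> usize {
        text.chars().count() / 4
    }

    pub fn count_tokens(&self, text: &str) -> usize {
        // Oversized inputs bypass the cache so they cannot pollute it.
        if text.len() > MAX_INPUT_BYTES {
            return Self::fallback_estimate(text);
        }
        if let Some(&n) = self.cache.lock().unwrap().get(text) {
            return n; // amortized O(1) on repeat lookups
        }
        // Real implementation: encode with the tiktoken cl100k_base
        // tokenizer here, falling back to chars/4 if it is unavailable.
        let n = Self::fallback_estimate(text);
        let mut cache = self.cache.lock().unwrap();
        if cache.len() < CACHE_CAP {
            cache.insert(text.to_string(), n);
        }
        n
    }
}
```

The cap-then-insert check is the simplest eviction-free policy; the real cache may differ.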
49 changes: 49 additions & 0 deletions Cargo.lock


2 changes: 2 additions & 0 deletions Cargo.toml
@@ -27,6 +27,7 @@ candle-core = { version = "0.9", default-features = false }
candle-nn = { version = "0.9", default-features = false }
candle-transformers = { version = "0.9", default-features = false }
chrono = { version = "0.4", default-features = false, features = ["std"] }
dashmap = "6.1"
clap = { version = "4.5", features = ["derive"] }
criterion = "0.8"
cron = "0.15"
@@ -68,6 +69,7 @@ subtle = "2.6"
symphonia = { version = "0.5.5", default-features = false, features = ["mp3", "ogg", "wav", "flac", "pcm"] }
teloxide = { version = "0.17", default-features = false, features = ["rustls", "ctrlc_handler", "macros"] }
tempfile = "3"
tiktoken-rs = "0.9"
testcontainers = "0.27"
thiserror = "2.0"
tokenizers = { version = "0.22", default-features = false, features = ["fancy-regex"] }
1 change: 1 addition & 0 deletions README.md
@@ -24,6 +24,7 @@ Most AI agent frameworks dump every tool description, skill, and raw output into
- **Semantic skill selection** — embeds skills as vectors, retrieves only top-K relevant per query instead of injecting all
- **Smart output filtering** — command-aware filters strip 70-99% of noise before context injection; oversized responses offloaded to filesystem
- **Two-tier context pruning** — selective eviction + adaptive chunked compaction with parallel summarization keeps the window clean
- **Accurate token counting** — tiktoken-based cl100k_base tokenizer with DashMap cache replaces chars/4 heuristic
- **Proportional budget allocation** — context space distributed by purpose, not arrival order

## Installation
4 changes: 2 additions & 2 deletions crates/zeph-core/README.md
@@ -24,7 +24,7 @@ Core orchestration crate for the Zeph agent. Manages the main agent loop, bootst
| `bootstrap` | `AppBuilder` — fluent builder for application startup |
| `channel` | `Channel` trait defining I/O adapters; `LoopbackChannel` / `LoopbackHandle` for headless daemon I/O (`LoopbackHandle` exposes `cancel_signal: Arc<Notify>` for session cancellation); `Attachment` / `AttachmentKind` for multimodal inputs |
| `config` | TOML config with `ZEPH_*` env overrides; typed `ConfigError` (Io, Parse, Validation, Vault) |
| `context` | LLM context assembly from history, skills, memory; adaptive chunked compaction with parallel summarization |
| `context` | LLM context assembly from history, skills, memory; adaptive chunked compaction with parallel summarization; uses shared `Arc<TokenCounter>` for accurate tiktoken-based budget tracking |
| `cost` | Token cost tracking and budgeting |
| `daemon` | Background daemon mode with PID file lifecycle (optional feature) |
| `metrics` | Runtime metrics collection |
@@ -53,7 +53,7 @@ Key `MemoryConfig` fields (TOML section `[memory]`):
| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `vector_backend` | `"qdrant"` / `"sqlite"` | `"qdrant"` | Vector search backend |
| `token_safety_margin` | f32 | `1.0` | Safety multiplier for token budget estimation (validated: must be >= 1.0) |
| `token_safety_margin` | f32 | `1.0` | Safety multiplier for tiktoken-based token budget (validated: must be >= 1.0) |
| `redact_credentials` | bool | `true` | Scrub secrets and paths before LLM context injection |
| `autosave_assistant` | bool | `false` | Persist assistant responses to semantic memory automatically |
| `autosave_min_length` | usize | `20` | Minimum response length (chars) to trigger autosave |
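A minimal `[memory]` snippet matching the defaults in the table above (a sketch; only the fields shown in this table are included, and the file may define others):

```toml
[memory]
vector_backend = "qdrant"     # or "sqlite"
token_safety_margin = 1.0     # validated: must be >= 1.0
redact_credentials = true
autosave_assistant = false
autosave_min_length = 20
```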
8 changes: 5 additions & 3 deletions crates/zeph-core/benches/context_building.rs
@@ -1,6 +1,6 @@
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
use std::hint::black_box;
use zeph_memory::estimate_tokens;
use zeph_memory::TokenCounter;

fn generate_messages(count: usize, avg_len: usize) -> Vec<String> {
let base = "This is a simulated message with typical content for an AI conversation. ";
@@ -13,13 +13,14 @@ fn generate_messages(count: usize, avg_len: usize) -> Vec<String> {
}

fn should_compact_check(c: &mut Criterion) {
let counter = TokenCounter::new();
let mut group = c.benchmark_group("should_compact");

for count in [20, 50, 100] {
let messages = generate_messages(count, 200);
group.bench_with_input(BenchmarkId::new("messages", count), &messages, |b, msgs| {
b.iter(|| {
let total: usize = msgs.iter().map(|m| estimate_tokens(m)).sum();
let total: usize = msgs.iter().map(|m| counter.count_tokens(m)).sum();
black_box(total > 4000)
});
});
@@ -29,6 +30,7 @@ }
}

fn trim_budget_scan(c: &mut Criterion) {
let counter = TokenCounter::new();
let mut group = c.benchmark_group("trim_budget_scan");

for count in [20, 50, 100] {
@@ -40,7 +42,7 @@
let mut total = 0usize;
let mut keep_from = msgs.len();
for i in (0..msgs.len()).rev() {
let tokens = estimate_tokens(&msgs[i]);
let tokens = counter.count_tokens(&msgs[i]);
if total + tokens > budget {
break;
}