Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7,743 changes: 7,743 additions & 0 deletions codex-cli/package-lock.json

Large diffs are not rendered by default.

45 changes: 45 additions & 0 deletions codex-cli/src/cli.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,34 @@ import {
import { createInputItem } from "./utils/input-utils";
import { initLogger } from "./utils/logger/log";
import { isModelSupportedForResponses } from "./utils/model-utils.js";
import { approximateTokensUsed } from "./utils/approximate-tokens-used.js";

// ── Pricing table for cost estimation (USD per token) ───────────────────────
type TokenRates = { input: number; cachedInput: number; output: number };

/** Build a TokenRates entry from USD-per-million-token list prices. */
const usdPerMillion = (
  input: number,
  cachedInput: number,
  output: number,
): TokenRates => ({
  input: input / 1e6,
  cachedInput: cachedInput / 1e6,
  output: output / 1e6,
});

const detailedPriceMap: Record<string, TokenRates> = {
  // OpenAI "o-series" experimental
  "o3": usdPerMillion(10, 2.5, 40),
  "o4-mini": usdPerMillion(1.1, 0.275, 4.4),
  // GPT-4.1 family
  "gpt-4.1-nano": usdPerMillion(0.1, 0.025, 0.4),
  "gpt-4.1-mini": usdPerMillion(0.4, 0.1, 1.6),
  "gpt-4.1": usdPerMillion(2, 0.5, 8),
  // GPT-4o family
  "gpt-4o-mini": usdPerMillion(0.6, 0.3, 2.4),
  "gpt-4o": usdPerMillion(5, 2.5, 20),
};

/**
 * Estimate cost in USD for a single request.
 *
 * @param model           Model name; matched case-insensitively against the
 *                        pricing table above.
 * @param inputTokens     Prompt token count.
 * @param outputTokens    Completion token count.
 * @param useCachedPrompt When true, price the prompt at the cached-input rate.
 * @returns Estimated cost in USD, or 0 when the model has no pricing entry.
 */
function estimateCost(
  model: string,
  inputTokens: number,
  outputTokens: number,
  useCachedPrompt = false,
): number {
  const rates = detailedPriceMap[model.toLowerCase()];
  if (!rates) {
    // Unknown model: no pricing data — report zero rather than guess.
    return 0;
  }
  const { input, cachedInput, output } = rates;
  const promptRate = useCachedPrompt ? cachedInput : input;
  return inputTokens * promptRate + outputTokens * output;
}
import { parseToolCall } from "./utils/parsers";
import { onExit, setInkRenderer } from "./utils/terminal";
import chalk from "chalk";
Expand Down Expand Up @@ -509,6 +537,8 @@ async function runQuietMode({
additionalWritableRoots: ReadonlyArray<string>;
config: AppConfig;
}): Promise<void> {
// Collect all response items to compute output token count
const outputItems: Array<ResponseItem> = [];
const agent = new AgentLoop({
model: config.model,
config: config,
Expand All @@ -520,6 +550,8 @@ async function runQuietMode({
onItem: (item: ResponseItem) => {
// eslint-disable-next-line no-console
console.log(formatResponseItemForQuietMode(item));
// track for cost estimation
outputItems.push(item);
},
onLoading: () => {
/* intentionally ignored in quiet mode */
Expand All @@ -541,6 +573,19 @@ async function runQuietMode({

const inputItem = await createInputItem(prompt, imagePaths);
await agent.run([inputItem]);
// After streaming completes, estimate and print cost
try {
const inputTokens = Math.ceil(prompt.length / 4);
const outputTokens = approximateTokensUsed(outputItems);
const cost = estimateCost(config.model, inputTokens, outputTokens);
// eslint-disable-next-line no-console
console.log(
`\nCost estimate (model=${config.model}): $${cost.toFixed(6)} ` +
`(${inputTokens} in • ${outputTokens} out)`
);
} catch {
// ignore errors in cost computation
}
}

const exit = () => {
Expand Down
4 changes: 2 additions & 2 deletions codex-rs/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ anyhow = "1"
async-channel = "2.3.1"
base64 = "0.21"
bytes = "1.10.1"
clap = { version = "4", features = ["derive", "wrap_help"], optional = true }
clap = { version = "4", features = ["derive", "wrap_help"] }
codex-apply-patch = { path = "../apply-patch" }
dirs = "6"
env-flags = "0.1.1"
Expand Down Expand Up @@ -59,4 +59,4 @@ wiremock = "0.6"
default = []

# NOTE: `clap` is now a mandatory dependency; the `cli` feature only gates CLI-specific exports.
cli = ["clap"]
cli = []
62 changes: 47 additions & 15 deletions codex-rs/core/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,32 @@ pub struct Prompt {
pub store: bool,
}

/// Token usage breakdown from the Responses API (when present).
///
/// Every field is optional: serde deserializes a missing `Option` field as
/// `None`, so payloads that omit any counter still parse successfully.
#[derive(Debug, Clone, serde::Deserialize)]
pub struct UsageBreakdown {
    /// Number of prompt/input tokens, if reported by the API.
    pub input_tokens: Option<i64>,
    /// Finer-grained input breakdown (e.g. cached tokens), if reported.
    #[serde(default)]
    pub input_tokens_details: Option<InputTokensDetails>,
    /// Number of generated output tokens, if reported by the API.
    pub output_tokens: Option<i64>,
    /// Total token count for the response, if reported by the API.
    pub total_tokens: Option<i64>,
}

/// Extra details about input tokens (e.g., cached tokens).
#[derive(Debug, Clone, serde::Deserialize)]
pub struct InputTokensDetails {
    /// Number of input tokens served from the prompt cache, if reported.
    /// Callers price these at the cheaper cached-input rate.
    pub cached_tokens: Option<i64>,
}

/// Events emitted by the streaming Responses API.
#[derive(Debug)]
pub enum ResponseEvent {
    /// A single content item is complete.
    OutputItemDone(ResponseItem),
    /// The full response is complete: `response_id` and optional usage.
    Completed {
        /// ID of the completed response, for later retrieval/chaining.
        response_id: String,
        /// Exact token usage when the `response.completed` payload carried a
        /// `usage` object; `None` when the API omitted it.
        usage: Option<UsageBreakdown>,
    },
}

#[derive(Debug, Serialize)]
Expand Down Expand Up @@ -141,6 +163,10 @@ impl ModelClient {
let client = reqwest::Client::new();
Self { model, client }
}
/// Return the model identifier this client was configured with.
pub fn model(&self) -> &str {
    self.model.as_str()
}

pub async fn stream(&mut self, prompt: &Prompt) -> Result<ResponseStream> {
if let Some(path) = &*CODEX_RS_SSE_FIXTURE {
Expand Down Expand Up @@ -243,9 +269,14 @@ struct SseEvent {
item: Option<Value>,
}

/// Payload for a completed response, including optional token usage.
///
/// Deserialized from the `response` field of the `response.completed` SSE
/// event.
#[derive(Debug, Deserialize)]
struct ResponseCompleted {
    /// The response ID for retrieval or pagination.
    id: String,
    /// Optional token usage breakdown provided by the API; defaults to `None`
    /// when the payload omits it.
    #[serde(default)]
    usage: Option<UsageBreakdown>,
}

async fn process_sse<S>(stream: S, tx_event: mpsc::Sender<Result<ResponseEvent>>)
Expand All @@ -257,7 +288,9 @@ where
// If the stream stays completely silent for an extended period treat it as disconnected.
let idle_timeout = *OPENAI_STREAM_IDLE_TIMEOUT_MS;
// The response id returned from the "complete" message.
let mut response_id = None;
let mut response_id: Option<String> = None;
// Capture real token usage when `response.completed` includes it.
let mut usage: Option<UsageBreakdown> = None;

loop {
let sse = match timeout(idle_timeout, stream.next()).await {
Expand All @@ -269,18 +302,16 @@ where
return;
}
Ok(None) => {
match response_id {
Some(response_id) => {
let event = ResponseEvent::Completed { response_id };
let _ = tx_event.send(Ok(event)).await;
}
None => {
let _ = tx_event
.send(Err(CodexErr::Stream(
"stream closed before response.completed".into(),
)))
.await;
}
if let Some(response_id) = response_id.clone() {
let event = ResponseEvent::Completed { response_id, usage: usage.clone() };
let _ = tx_event.send(Ok(event)).await;
} else {
// No response ID available: treat as stream error
let _ = tx_event
.send(Err(CodexErr::Stream(
"stream closed before response.completed".into(),
)))
.await;
}
return;
}
Expand Down Expand Up @@ -337,7 +368,8 @@ where
if let Some(resp_val) = event.response {
match serde_json::from_value::<ResponseCompleted>(resp_val) {
Ok(r) => {
response_id = Some(r.id);
response_id = Some(r.id.clone());
usage = r.usage;
}
Err(e) => {
debug!("failed to parse ResponseCompleted: {e}");
Expand Down
108 changes: 100 additions & 8 deletions codex-rs/core/src/codex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,29 @@ use crate::safety::assess_patch_safety;
use crate::safety::SafetyCheck;
use crate::user_notification::UserNotification;
use crate::util::backoff;
/// Per-token pricing rates (USD) for input, cached input, and output.
struct TokenRates {
    input: f64,
    cached_input: f64,
    output: f64,
}

/// Return the per-token rates for a model, or None if unknown.
///
/// Model names are matched case-insensitively against a fixed pricing table;
/// any name without an entry yields `None`.
fn get_token_rates(model: &str) -> Option<TokenRates> {
    // Convert USD-per-million-token list prices into per-token rates.
    let per_million = |input: f64, cached_input: f64, output: f64| TokenRates {
        input: input / 1_000_000.0,
        cached_input: cached_input / 1_000_000.0,
        output: output / 1_000_000.0,
    };
    let rates = match model.to_lowercase().as_str() {
        // OpenAI o-series experimental
        "o3" => per_million(10.0, 2.5, 40.0),
        "o4-mini" => per_million(1.1, 0.275, 4.4),
        // GPT-4.1 family
        "gpt-4.1-nano" => per_million(0.1, 0.025, 0.4),
        "gpt-4.1-mini" => per_million(0.4, 0.1, 1.6),
        "gpt-4.1" => per_million(2.0, 0.5, 8.0),
        // GPT-4o family
        "gpt-4o-mini" => per_million(0.6, 0.3, 2.4),
        "gpt-4o" => per_million(5.0, 2.5, 20.0),
        _ => return None,
    };
    Some(rates)
}
use crate::zdr_transcript::ZdrTranscript;

/// The high-level interface to the Codex system.
Expand Down Expand Up @@ -631,6 +654,18 @@ async fn run_task(sess: Arc<Session>, sub_id: String, input: Vec<InputItem>) {
if input.is_empty() {
return;
}
// Approximate token count for input: assume ~4 characters per token
let initial_input_items = input.clone();
let input_char_count: usize = initial_input_items
.iter()
.map(|item| match item {
InputItem::Text { text } => text.len(),
_ => 0,
})
.sum();
let input_tokens: usize = (input_char_count + 3) / 4;
// Track output character count for token approximation
let mut output_char_count: usize = 0;
let event = Event {
id: sub_id.clone(),
msg: EventMsg::TaskStarted,
Expand All @@ -640,6 +675,8 @@ async fn run_task(sess: Arc<Session>, sub_id: String, input: Vec<InputItem>) {
}

let mut pending_response_input: Vec<ResponseInputItem> = vec![ResponseInputItem::from(input)];
// Track exact usage when provided by the API
let mut final_usage: Option<crate::client::UsageBreakdown> = None;
loop {
let mut net_new_turn_input = pending_response_input
.drain(..)
Expand Down Expand Up @@ -677,7 +714,11 @@ async fn run_task(sess: Arc<Session>, sub_id: String, input: Vec<InputItem>) {
})
.collect();
match run_turn(&sess, sub_id.clone(), turn_input).await {
Ok(turn_output) => {
Ok((turn_output, usage_opt)) => {
// Capture usage for cost computation when available
if usage_opt.is_some() {
final_usage = usage_opt;
}
let (items, responses): (Vec<_>, Vec<_>) = turn_output
.into_iter()
.map(|p| (p.item, p.response))
Expand All @@ -687,6 +728,25 @@ async fn run_task(sess: Arc<Session>, sub_id: String, input: Vec<InputItem>) {
.flatten()
.collect::<Vec<ResponseInputItem>>();
let last_assistant_message = get_last_assistant_message_from_turn(&items);
// Accumulate output characters for token approximation
for item in &items {
match item {
ResponseItem::Message { role, content } if role == "assistant" => {
for c in content {
if let ContentItem::OutputText { text } = c {
output_char_count += text.len();
}
}
}
ResponseItem::FunctionCall { name, arguments, .. } => {
output_char_count += name.len() + arguments.len();
}
ResponseItem::FunctionCallOutput { output, .. } => {
output_char_count += output.len();
}
_ => {}
}
}

// Only attempt to take the lock if there is something to record.
if !items.is_empty() {
Expand Down Expand Up @@ -721,9 +781,35 @@ async fn run_task(sess: Arc<Session>, sub_id: String, input: Vec<InputItem>) {
}
}
sess.remove_task(&sub_id);
// Approximate token count for output: assume ~4 characters per token
let output_tokens: usize = (output_char_count + 3) / 4;
// Compute cost: prefer exact API usage when available
let model_name = sess.client.model();
// Compute cost: prefer exact API usage when available, fallback to heuristic
let precise = final_usage.is_some();
let cost = if let Some(rates) = get_token_rates(model_name) {
if precise {
let u = final_usage.unwrap();
let input_toks = u.input_tokens.unwrap_or(input_tokens as i64) as usize;
let cached_toks = u
.input_tokens_details
.and_then(|d| d.cached_tokens)
.unwrap_or(0) as usize;
let non_cached = input_toks.saturating_sub(cached_toks);
let out_toks = u.output_tokens.unwrap_or(output_tokens as i64) as usize;
(non_cached as f64) * rates.input
+ (cached_toks as f64) * rates.cached_input
+ (out_toks as f64) * rates.output
} else {
// Fallback heuristic
(input_tokens as f64) * rates.input + (output_tokens as f64) * rates.output
}
} else {
0.0
};
let event = Event {
id: sub_id,
msg: EventMsg::TaskComplete,
msg: EventMsg::TaskComplete { input_tokens, output_tokens, cost, precise },
};
sess.tx_event.send(event).await.ok();
}
Expand All @@ -732,7 +818,7 @@ async fn run_turn(
sess: &Session,
sub_id: String,
input: Vec<ResponseItem>,
) -> CodexResult<Vec<ProcessedResponseItem>> {
) -> CodexResult<(Vec<ProcessedResponseItem>, Option<crate::client::UsageBreakdown>)> {
// Decide whether to use server-side storage (previous_response_id) or disable it
let (prev_id, store, is_first_turn) = {
let state = sess.state.lock().unwrap();
Expand Down Expand Up @@ -763,7 +849,7 @@ async fn run_turn(
let mut retries = 0;
loop {
match try_run_turn(sess, &sub_id, &prompt).await {
Ok(output) => return Ok(output),
Ok((output, usage_opt)) => return Ok((output, usage_opt)),
Err(CodexErr::Interrupted) => return Err(CodexErr::Interrupted),
Err(e) => {
if retries < *OPENAI_STREAM_MAX_RETRIES {
Expand Down Expand Up @@ -808,14 +894,20 @@ async fn try_run_turn(
sess: &Session,
sub_id: &str,
prompt: &Prompt,
) -> CodexResult<Vec<ProcessedResponseItem>> {
) -> CodexResult<(Vec<ProcessedResponseItem>, Option<crate::client::UsageBreakdown>)> {
let mut stream = sess.client.clone().stream(prompt).await?;

// Buffer all the incoming messages from the stream first, then execute them.
// If we execute a function call in the middle of handling the stream, it can time out.
let mut input = Vec::new();
// Collect all events, capturing the final usage if provided
let mut usage: Option<crate::client::UsageBreakdown> = None;
while let Some(event) = stream.next().await {
input.push(event?);
let ev = event?;
if let crate::client::ResponseEvent::Completed { usage: u, .. } = &ev {
usage = u.clone();
}
input.push(ev);
}

let mut output = Vec::new();
Expand All @@ -825,14 +917,14 @@ async fn try_run_turn(
let response = handle_response_item(sess, sub_id, item.clone()).await?;
output.push(ProcessedResponseItem { item, response });
}
ResponseEvent::Completed { response_id } => {
ResponseEvent::Completed { response_id, usage: _ } => {
let mut state = sess.state.lock().unwrap();
state.previous_response_id = Some(response_id);
break;
}
}
}
Ok(output)
Ok((output, usage))
}

async fn handle_response_item(
Expand Down
3 changes: 0 additions & 3 deletions codex-rs/core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,6 @@ mod zdr_transcript;

pub use codex::Codex;

#[cfg(feature = "cli")]
mod approval_mode_cli_arg;
#[cfg(feature = "cli")]
pub use approval_mode_cli_arg::ApprovalModeCliArg;
#[cfg(feature = "cli")]
pub use approval_mode_cli_arg::SandboxPermissionOption;
Loading
Loading