Commit d7dc4dc

feat(xtask): fill in the usage field in responses
Signed-off-by: YdrMaster <ydrml@hotmail.com>
1 parent 83c51f4 commit d7dc4dc

5 files changed: +73 −55 lines changed

xtask/src/service/blacklist_integration_test.rs

Lines changed: 3 additions & 8 deletions
@@ -282,9 +282,7 @@ fn test_blacklist_configuration() {
         let lower_word = word.to_lowercase();
         assert!(
             blacklist.iter().any(|bw| bw == &lower_word),
-            "Blacklist should contain '{}' (lowercase: '{}')",
-            word,
-            lower_word
+            "Blacklist should contain '{word}' (lowercase: '{lower_word}')"
         );
     }

@@ -295,10 +293,7 @@ fn test_blacklist_configuration() {
     let max_length = word_lengths.iter().max().unwrap();
     let min_length = word_lengths.iter().min().unwrap();

-    info!(
-        "Blacklist word length range: {} to {} characters",
-        min_length, max_length
-    );
+    info!("Blacklist word length range: {min_length} to {max_length} characters");
     assert!(
         *max_length > 10,
         "Should have words longer than 10 characters for suffix optimization test"
@@ -378,7 +373,7 @@ blacklist = [
 "#;

     info!("Example TOML configuration:");
-    info!("{}", toml_config);
+    info!("{toml_config}");

     // In practice, you'd parse this with:
     // let config: ModelConfig = toml::from_str(toml_config).unwrap();
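
Note: the hunks above are a mechanical migration from positional {} placeholders with trailing arguments to inlined format arguments, stable since Rust 1.58. A minimal standalone sketch of the two equivalent forms, with an illustrative value not taken from the repository:

fn main() {
    let word = "Forbidden";
    let lower_word = word.to_lowercase();
    // Old style: positional placeholders with trailing arguments.
    println!("Blacklist should contain '{}' (lowercase: '{}')", word, lower_word);
    // New style (Rust 1.58+): identifiers are captured inside the braces.
    println!("Blacklist should contain '{word}' (lowercase: '{lower_word}')");
}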

xtask/src/service/client.rs

Lines changed: 1 addition & 1 deletion
@@ -358,7 +358,7 @@ fn test_blacklisted_check() {
     let normal_prompt = "Tell me a story about a cat";
     let req_body_normal = requset_body_chat(normal_prompt);

-    info!("Sending normal request: {}", normal_prompt);
+    info!("Sending normal request: {normal_prompt}");
     let normal_result =
         send_single_request(port, &client, &headers, req_body_normal, Some(1)).await;

xtask/src/service/mod.rs

Lines changed: 17 additions & 19 deletions
@@ -8,9 +8,7 @@ mod response;
 use crate::{
     parse_gpus,
     service::{
-        openai::{
-            chat_completion_response, chat_completion_response_stream, create_completion_response,
-        },
+        openai::{chat_completion_response, chat_completion_response_stream, completion_response},
         response::text_stream,
     },
 };
@@ -226,16 +224,14 @@ impl HyperService<Request<Incoming>> for App {
                 return Ok(text_stream(UnboundedReceiverStream::new(receiver).map(
                     move |output| {
                         let response = match output {
-                            model::Output::Text { content, .. } => {
-                                create_completion_response(
-                                    id,
-                                    created,
-                                    model_name.clone(),
-                                    content,
-                                    None,
-                                )
-                            }
-                            model::Output::Finish(reason) => create_completion_response(
+                            model::Output::Text { content, .. } => completion_response(
+                                id,
+                                created,
+                                model_name.clone(),
+                                content,
+                                None,
+                            ),
+                            model::Output::Finish { reason, .. } => completion_response(
                                 id,
                                 created,
                                 model_name.clone(),
@@ -257,14 +253,13 @@ impl HyperService<Request<Incoming>> for App {
                         think_.push_str(&think);
                         content_.push_str(&content);
                     }
-                    model::Output::Finish(reason) => {
+                    model::Output::Finish { reason, .. } => {
                         assert!(reason_.replace(reason).is_none())
                     }
                 }
             }

-            let response =
-                create_completion_response(id, created, model_name, content_, reason_);
+            let response = completion_response(id, created, model_name, content_, reason_);
             Ok(json(response))
         })
     }
@@ -314,7 +309,7 @@ impl HyperService<Request<Incoming>> for App {
                                     None,
                                 )
                             }
-                            model::Output::Finish(reason) => {
+                            model::Output::Finish { reason, .. } => {
                                 chat_completion_response_stream(
                                     id,
                                     created,
@@ -333,14 +328,16 @@ impl HyperService<Request<Incoming>> for App {
             let mut think_ = String::new();
             let mut content_ = String::new();
             let mut reason_ = None;
+            let mut num_tokens_ = [0, 0];
             while let Some(output) = receiver.recv().await {
                 match output {
                     model::Output::Text { think, content } => {
                         think_.push_str(&think);
                         content_.push_str(&content);
                     }
-                    model::Output::Finish(reason) => {
-                        assert!(reason_.replace(reason).is_none())
+                    model::Output::Finish { reason, num_tokens } => {
+                        assert!(reason_.replace(reason).is_none());
+                        num_tokens_ = num_tokens
                     }
                 }
             }
@@ -351,6 +348,7 @@ impl HyperService<Request<Incoming>> for App {
                 model_name,
                 Some(think_).filter(|s| !s.is_empty()),
                 Some(content_).filter(|s| !s.is_empty()),
+                num_tokens_,
                 reason_,
             );
             Ok(json(response))
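
Note: the non-streaming chat path above ends with Some(think_).filter(|s| !s.is_empty()), which collapses an empty accumulator to None, presumably so empty think/content fields can be skipped when the response is serialized. The idiom in isolation, as a standalone sketch:

fn non_empty(s: String) -> Option<String> {
    // Wrap the string, then drop it again when the predicate fails.
    Some(s).filter(|s| !s.is_empty())
}

fn main() {
    assert_eq!(non_empty(String::new()), None);
    assert_eq!(non_empty("trace".to_string()), Some("trace".to_string()));
}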

xtask/src/service/model.rs

Lines changed: 38 additions & 22 deletions
@@ -29,18 +29,38 @@ pub(super) struct Model {
 }

 pub(super) enum Output {
-    Text { think: String, content: String },
-    Finish(FinishReason),
+    Text {
+        think: String,
+        content: String,
+    },
+    Finish {
+        reason: FinishReason,
+        num_tokens: [usize; 2],
+    },
 }

 struct SessionInfo {
     sender: UnboundedSender<Output>,
     buf: TextBuf,
     think: bool,
     tokens: Vec<utok>,
+    prompt_tokens: usize,
     accumulated_content: String, // Track all generated content for blacklist detection
 }

+impl SessionInfo {
+    fn new(sender: UnboundedSender<Output>, tokens: Vec<utok>) -> Self {
+        Self {
+            buf: TextBuf::new(),
+            think: false,
+            prompt_tokens: tokens.len(),
+            accumulated_content: String::new(),
+            sender,
+            tokens,
+        }
+    }
+}
+
 impl Model {
     pub fn new(config: ModelConfig, use_cuda_graph: bool) -> (Self, Service) {
         let ModelConfig {
@@ -181,7 +201,10 @@ impl Model {
                 // Send finish signal
                 if session_info
                     .sender
-                    .send(Output::Finish(FinishReason::Stop))
+                    .send(Output::Finish {
+                        reason: FinishReason::Stop,
+                        num_tokens: [session_info.prompt_tokens, session_info.tokens.len()],
+                    })
                     .is_err()
                 {
                     info!("{session_id:?} client connection closed");
@@ -202,8 +225,13 @@ impl Model {
                 // Handle session termination
                 if !sessions.is_empty() {
                     for (session, reason) in sessions {
-                        let SessionInfo { tokens, sender, .. } =
-                            sessions_guard.remove(&session.id).unwrap();
+                        let SessionInfo {
+                            tokens,
+                            sender,
+                            prompt_tokens,
+                            ..
+                        } = sessions_guard.remove(&session.id).unwrap();
+                        let num_tokens = [prompt_tokens, tokens.len()];
                         let reason = match reason {
                             ReturnReason::Finish => {
                                 // Completed normally; insert back into the cache
@@ -221,7 +249,7 @@ impl Model {
                         };

                         sender
-                            .send(Output::Finish(reason))
+                            .send(Output::Finish { reason, num_tokens })
                             .unwrap_or_else(|_| info!("{:?} failed to send normal completion", session.id));
                     }
                 }
@@ -298,18 +326,12 @@ impl Model {
            max_tokens,
        );

-        let session_info = SessionInfo {
-            sender,
-            tokens,
-            buf: TextBuf::new(),
-            think: false,
-            accumulated_content: String::new(),
-        };
+        let session_info = SessionInfo::new(sender, tokens);
         assert!(
             self.sessions
                 .lock()
                 .unwrap()
-                .insert(id, session_info,)
+                .insert(id, session_info)
                 .is_none()
         );

@@ -360,18 +382,12 @@ impl Model {
            max_tokens,
        );

-        let session_info = SessionInfo {
-            sender,
-            tokens,
-            buf: TextBuf::new(),
-            think: false,
-            accumulated_content: String::new(),
-        };
+        let session_info = SessionInfo::new(sender, tokens);
         assert!(
             self.sessions
                 .lock()
                 .unwrap()
-                .insert(id, session_info,)
+                .insert(id, session_info)
                 .is_none()
         );
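
Note: the accounting hinges on a detail visible in SessionInfo::new: generated tokens are appended to the same tokens vector that initially holds the prompt, so the prompt length must be snapshotted before generation starts, after which [prompt_tokens, tokens.len()] reads as [prompt, total]. A simplified stand-in (Session here is hypothetical, not the crate's type):

struct Session {
    tokens: Vec<u32>,     // prompt tokens, with generated tokens appended in place
    prompt_tokens: usize, // snapshot of tokens.len() taken at construction
}

impl Session {
    fn new(tokens: Vec<u32>) -> Self {
        // The length is read before `tokens` is moved into the struct.
        Self { prompt_tokens: tokens.len(), tokens }
    }

    fn finish(&self) -> [usize; 2] {
        // [prompt, total]; the completion count is the difference.
        [self.prompt_tokens, self.tokens.len()]
    }
}

fn main() {
    let mut s = Session::new(vec![1, 2, 3]); // a 3-token prompt
    s.tokens.extend([4, 5]);                 // the model generates 2 tokens
    let [prompt, total] = s.finish();
    assert_eq!((prompt, total, total - prompt), (3, 5, 2));
}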

xtask/src/service/openai.rs

Lines changed: 14 additions & 5 deletions
@@ -1,9 +1,10 @@
 use hyper::Method;
 use openai_struct::{
-    ChatCompletionResponseMessage, ChatCompletionStreamResponseDelta, CreateChatCompletionResponse,
-    CreateChatCompletionResponseChoices, CreateChatCompletionStreamResponse,
-    CreateChatCompletionStreamResponseChoices, CreateCompletionResponse,
-    CreateCompletionResponseChoices, CreateCompletionResponseLogprobs, FinishReason, Model,
+    ChatCompletionResponseMessage, ChatCompletionStreamResponseDelta, CompletionUsage,
+    CreateChatCompletionResponse, CreateChatCompletionResponseChoices,
+    CreateChatCompletionStreamResponse, CreateChatCompletionStreamResponseChoices,
+    CreateCompletionResponse, CreateCompletionResponseChoices, CreateCompletionResponseLogprobs,
+    FinishReason, Model,
 };
 use serde::Serialize;

@@ -42,6 +43,7 @@ pub(crate) fn chat_completion_response(
     model: String,
     think: Option<String>,
     answer: Option<String>,
+    [prompt_tokens, total_tokens]: [usize; 2],
     finish_reason: Option<FinishReason>,
 ) -> CreateChatCompletionResponse {
     let choices = vec![CreateChatCompletionResponseChoices {
@@ -59,6 +61,13 @@
         model,
         choices,
         created,
+        usage: Some(CompletionUsage {
+            completion_tokens: (total_tokens - prompt_tokens) as _,
+            prompt_tokens: prompt_tokens as _,
+            total_tokens: total_tokens as _,
+            completion_tokens_details: None,
+            prompt_tokens_details: None,
+        }),
         ..Default::default()
     }
 }
@@ -90,7 +99,7 @@ pub(crate) fn chat_completion_response_stream(
     }
 }

-pub(crate) fn create_completion_response(
+pub(crate) fn completion_response(
     id: usize,
     created: i32,
     model: String,
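
Note: two details above are worth calling out: the new parameter [prompt_tokens, total_tokens]: [usize; 2] destructures the pair directly in the function signature, and completion_tokens is derived as the difference rather than passed separately. A self-contained sketch of both, with a hypothetical Usage struct standing in for openai_struct::CompletionUsage:

#[derive(Debug, PartialEq)]
struct Usage {
    prompt_tokens: usize,
    completion_tokens: usize,
    total_tokens: usize,
}

// Array destructuring directly in the parameter list, as in the new
// chat_completion_response signature. Assumes total_tokens >= prompt_tokens,
// which the producer guarantees by construction.
fn usage([prompt_tokens, total_tokens]: [usize; 2]) -> Usage {
    Usage {
        prompt_tokens,
        completion_tokens: total_tokens - prompt_tokens,
        total_tokens,
    }
}

fn main() {
    // e.g. a 12-token prompt that grew to 57 tokens in total
    let u = usage([12, 57]);
    assert_eq!(
        u,
        Usage { prompt_tokens: 12, completion_tokens: 45, total_tokens: 57 }
    );
}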
