1+ import gradio as gr
2+ import requests
3+ import json
4+ import argparse
5+ import time
6+ import gradio_musa
7+
8+
def parse_args(argv=None):
    """Parse the command-line options of the chat web UI.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) lets argparse read
            ``sys.argv[1:]``, which is what a plain ``parse_args()`` call does.

    Returns:
        argparse.Namespace with three attributes:
            ip: str, defaults to "0.0.0.0".
            port: str, defaults to "8000".
            model_name: str, or None when ``--model-name`` is not given.
    """
    # The script launches a Gradio UI that talks to a vLLM server; it does not
    # start the vLLM server itself, so the description says so.
    parser = argparse.ArgumentParser(
        description="Start a Gradio chat web UI backed by a vLLM server."
    )

    parser.add_argument(
        "--ip",
        type=str,
        default="0.0.0.0",  # used when --ip is not passed
        help="IP address to bind to (default: 0.0.0.0)",
    )
    parser.add_argument(
        "--port",
        type=str,
        default="8000",  # used when --port is not passed
        help="Port number to use (default: 8000)",
    )
    parser.add_argument(
        "--model-name",
        type=str,
        help="Model Name",
    )

    return parser.parse_args(argv)
36+
args = parse_args()

# Address of the vLLM inference service's chat-completions endpoint and the
# model name sent with every request. The URL must not contain whitespace
# around the host/port placeholders, otherwise the request target is invalid.
# NOTE: the same --ip value is also passed to demo.launch() as the UI's
# server_name further down in this file.
VLLM_API_URL = f"http://{args.ip}:{args.port}/v1/chat/completions"
MODEL_NAME = args.model_name
41+
42+
def _extract_stream_content(raw_line):
    """Return the text fragment carried by one streamed response line.

    Args:
        raw_line: One line (bytes) as produced by ``response.iter_lines()``.

    Returns:
        The non-empty ``choices[0].delta.content`` string of a ``data: ...``
        line, or None for anything else: keep-alive blank lines, lines without
        the ``data: `` prefix, the ``[DONE]`` marker, payloads that are not
        valid JSON, and chunks that carry no (or empty) content.
    """
    if not raw_line:
        return None

    line = raw_line.decode("utf-8").strip()
    prefix = "data: "
    if not line.startswith(prefix):
        return None

    data = line[len(prefix):]
    if data == "[DONE]":
        return None

    try:
        event = json.loads(data)
    except json.JSONDecodeError:
        # Best effort: an undecodable line is skipped so that one bad chunk
        # does not abort an otherwise healthy stream.
        return None

    if not isinstance(event, dict):
        return None
    choices = event.get("choices") or []
    if not choices:
        return None
    delta = choices[0].get("delta") or {}
    # ``content`` may be missing, None or "" (e.g. a role-only chunk); none of
    # those is generated text, so they are not reported as a token.
    return delta.get("content") or None


def chat_with_model_streaming(user_input, history):
    """Stream one chat completion and yield incremental UI updates.

    Only the fixed system prompt and ``user_input`` are sent to the model;
    ``history`` is used for display and is not forwarded in the request.

    Args:
        user_input: Text the user typed.
        history: Existing chat history as a list of (user, bot) pairs, or a
            falsy value for an empty chat.

    Yields:
        3-tuples matching the three UI outputs the handler is wired to:
        (updated chat history, "" to clear the input box, status text).
        While streaming the status is "推理中..."; the final update carries the
        latency / throughput summary. On any failure a single update with the
        error message as the bot reply is yielded instead.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_input},
    ]
    payload = {
        "model": MODEL_NAME,
        "messages": messages,
        "stream": True,  # ask the server for a streamed response
    }

    history = history or []
    bot_response = ""  # reply text accumulated so far
    token_count = 0  # number of content chunks received (one chunk counted as one token)
    first_token_time = None
    start_time = time.perf_counter()

    try:
        with requests.post(VLLM_API_URL, json=payload, stream=True) as response:
            response.raise_for_status()

            for raw_line in response.iter_lines():
                content = _extract_stream_content(raw_line)
                if content is None:
                    continue

                bot_response += content
                token_count += 1
                if first_token_time is None:
                    first_token_time = time.perf_counter()

                yield history + [(user_input, bot_response)], "", "推理中..."

        # Timing summary. When the stream produced no content there is no
        # first-token timestamp, so all timings are reported as zero instead
        # of subtracting from None.
        if first_token_time is None:
            first_token_latency = 0.0
            elapsed_time = 0.0
        else:
            first_token_latency = first_token_time - start_time
            elapsed_time = time.perf_counter() - first_token_time
        tps = token_count / elapsed_time if elapsed_time > 0 else 0
        speed_text = (
            f"⏳ 首字延迟: {first_token_latency:.2f} 秒 | "
            f"⏱️ 耗时: {elapsed_time:.2f} 秒 | "
            f"🔢 Tokens: {token_count} | "
            f"⚡ 速度: {tps:.2f} TPS"
        )
        yield history + [(user_input, bot_response)], "", speed_text

    except Exception as e:
        # UI boundary: show the failure in the chat instead of raising. The
        # update must have three values because the handler has three outputs.
        bot_response = f"❌ 推理失败: {e}"
        yield history + [(user_input, bot_response)], "", "❌ 推理失败"
97+
98+
99+
def clear_chat():
    """Return the values that reset the chat UI.

    Returns:
        A 3-tuple: an empty chat history, an empty input string, and the
        speed summary text with every figure set to zero.
    """
    empty_history = []
    empty_input = ""
    zeroed_stats = "⏳ 首字延迟: 0.00 秒 | ⏱️ 耗时: 0.00 秒 | 🔢 Tokens: 0 | ⚡ 速度: 0.00 TPS"
    return empty_history, empty_input, zeroed_stats
103+
# Build the Gradio interface.
with gradio_musa.Blocks() as demo:
    chatbot = gr.Chatbot(label="Running on MTT S4000")
    msg_input = gr.Textbox(
        placeholder="请输入你的问题",
        label="输入...",
        lines=1,
        autofocus=True,
    )

    # Read-only box showing the latency / throughput summary of the last reply.
    speed_display = gr.Textbox(
        label="推理速度",
        value="⏳ 首字延迟: 0.00 秒 | ⏱️ 耗时: 0.00 秒 | 🔢 Tokens: 0 | ⚡ 速度: 0.00 TPS",
        interactive=False,
    )

    with gr.Row():
        submit_btn = gr.Button(value="提交")
        clear_btn = gr.Button("清除历史")

    # Pressing Enter in the textbox and clicking the submit button both run the
    # streaming handler with the same inputs and outputs.
    msg_input.submit(
        chat_with_model_streaming,
        inputs=[msg_input, chatbot],
        outputs=[chatbot, msg_input, speed_display],
    )
    submit_btn.click(
        chat_with_model_streaming,
        inputs=[msg_input, chatbot],
        outputs=[chatbot, msg_input, speed_display],
    )
    # The clear button resets the chat, the input box and the speed summary.
    clear_btn.click(
        clear_chat,
        inputs=[],
        outputs=[chatbot, msg_input, speed_display],
    )

# Queue is enabled so the generator handler can stream partial results.
demo.queue()
demo.launch(server_name=args.ip)
0 commit comments