# chat.py - Gradio chat UI that streams responses from an OpenAI-compatible endpoint.
from openai import OpenAI
import gradio as gr


# Define the predict function
def predict(message, history, temperature, max_tokens, model):
    # Create a client pointing at the local OpenAI-compatible endpoint
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="test-key")

    # Convert the Gradio (user, assistant) history pairs to OpenAI chat format
    history_openai_format = []
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append(
            {"role": "assistant", "content": assistant})
    history_openai_format.append({"role": "user", "content": message})

    # Request a streamed completion
    response = client.chat.completions.create(
        model=model,
        messages=history_openai_format,
        temperature=temperature,
        max_tokens=max_tokens,
        stream=True,
    )

    # Yield the growing partial message so the UI updates as tokens arrive
    partial_message = ""
    for chunk in response:
        if chunk.choices[0].delta.content is not None:
            partial_message = partial_message + chunk.choices[0].delta.content
            yield partial_message

if __name__ == "__main__":
    # Launch the chat interface
    gr.ChatInterface(
        predict,
        title="vLLM Demo Chatbot",
        description="vLLM Demo Chatbot",
        additional_inputs_accordion=gr.Accordion(open=True, label="Settings"),
        additional_inputs=[
            gr.Slider(
                label="Temperature",
                value=0.6,
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                interactive=True,
                info="Higher values produce more diverse outputs",
            ),
            gr.Slider(
                label="Max new tokens",
                value=500,
                minimum=0,
                maximum=4096,
                step=64,
                interactive=True,
                info="The maximum number of new tokens",
            ),
            gr.Dropdown(
                choices=[
                    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
                    "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
                ],
                interactive=True,
                value="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
                label="Model",
            ),
        ],
    ).launch()
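
# Usage sketch (an assumption, not part of the original file): this client expects an
# OpenAI-compatible server at http://localhost:8000/v1, for example a self-hosted vLLM
# deployment reached via `kubectl port-forward` on port 8000, or a local server started
# with `vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --port 8000`.
# With the endpoint reachable, run the UI with: python chat.py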