from typing import Literal
from typing_extensions import TypedDict
from .engine import EngineArgs


def get_openai_server_config(
    model_name: str,
    base_model: str,
    log_file: str,
    lora_path: str | None = None,
    config: "OpenAIServerConfig | None" = None,
) -> "OpenAIServerConfig":
    import os

    if config is None:
        config = OpenAIServerConfig()
    log_file = config.get("log_file", log_file)

    # Build LoRA modules list for multi-checkpoint support.
    # Only register the explicit step-qualified name so unsuffixed
    # trainable model names fail loudly.
    lora_modules: list[str] | None = None
    if lora_path:
        step = int(os.path.basename(lora_path))
        lora_modules = [f'{{"name": "{model_name}@{step}", "path": "{lora_path}"}}']

    server_args = ServerArgs(
        api_key="default",
        lora_modules=lora_modules,
        return_tokens_as_token_ids=True,
        enable_auto_tool_choice=True,
        tool_call_parser="hermes",
    )
    server_args.update(config.get("server_args", {}))

    engine_args = EngineArgs(
        model=base_model,
        # Serve the base model under its own HF name when LoRA is enabled so
        # `model.name` does not silently route to a stale/incorrect adapter.
        served_model_name=base_model if lora_path else model_name,
        generation_config="vllm",
    )
    engine_args.update(config.get("engine_args", {}))

    return OpenAIServerConfig(
        log_file=log_file, server_args=server_args, engine_args=engine_args
    )
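

# Example usage (an illustrative sketch; the model name, base model, and paths
# below are placeholders, not values used elsewhere in this package):
#
#     config = get_openai_server_config(
#         model_name="my-finetune",
#         base_model="Qwen/Qwen2.5-7B-Instruct",
#         log_file="/tmp/openai_server.log",
#         lora_path="/checkpoints/my-finetune/0042",
#     )
#
# With a LoRA path, only the step-qualified adapter name "my-finetune@42" is
# registered, and the base model is served under its own HF name.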


class OpenAIServerConfig(TypedDict, total=False):
    """
    Server configuration.

    Args:
        log_file: Path to the log file.
        server_args: Arguments for the vLLM OpenAI-compatible server.
        engine_args: Additional vLLM engine arguments for the OpenAI-compatible server.
            Note that since the vLLM engine is initialized with Unsloth,
            these additional arguments will only have an effect if the
            OpenAI-compatible server uses them elsewhere.
    """

    log_file: str
    server_args: "ServerArgs"
    engine_args: "EngineArgs"
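

# Example override (an illustrative sketch; the log path, port, and API key are
# placeholders). A partial OpenAIServerConfig can be passed to
# get_openai_server_config, which merges it over the defaults it builds:
#
#     overrides = OpenAIServerConfig(
#         log_file="/tmp/openai_server.log",
#         server_args=ServerArgs(port=8001, api_key="my-secret-key"),
#     )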


class ServerArgs(TypedDict, total=False):
    """Arguments for the vLLM OpenAI-compatible server, not including AsyncEngineArgs.

    Fields:
        host: Host name for the server. If None, will listen on all available interfaces (0.0.0.0).
        port: Port number for the server to listen on.
        uvicorn_log_level: Log level for the uvicorn server.
        allow_credentials: Whether to allow credentials in CORS requests.
        allowed_origins: JSON string that will be parsed into a list of allowed CORS origins. Defaults to ["*"].
        allowed_methods: JSON string that will be parsed into a list of allowed HTTP methods. Defaults to ["*"].
        allowed_headers: JSON string that will be parsed into a list of allowed HTTP headers. Defaults to ["*"].
        api_key: API key required for authentication. If None, no authentication is required.
        lora_modules: List of LoRA module configurations. Each string is either in 'name=path' format or a JSON object.
        prompt_adapters: List of prompt adapter configurations. Each string is in 'name=path' format.
        chat_template: Path to chat template file or the template in single-line form.
        chat_template_content_format: Format to render message content within chat templates.
        response_role: Role name to return when request.add_generation_prompt is true.
        ssl_keyfile: Path to SSL key file for HTTPS.
        ssl_certfile: Path to SSL certificate file for HTTPS.
        ssl_ca_certs: Path to CA certificates file for HTTPS.
        enable_ssl_refresh: Whether to refresh SSL context when certificate files change.
        ssl_cert_reqs: SSL certificate requirements (see stdlib ssl module's CERT_* constants).
        root_path: FastAPI root_path when app is behind a path-based routing proxy.
        middleware: List of additional ASGI middleware to apply to the app. Each string is an import path.
        return_tokens_as_token_ids: When max_logprobs is specified, represent tokens as 'token_id:{token_id}' strings.
        disable_frontend_multiprocessing: Run OpenAI frontend server in same process as model serving engine.
        enable_request_id_headers: Add X-Request-Id header to responses (may impact performance at high QPS).
        enable_auto_tool_choice: Enable automatic tool choice for supported models.
        tool_call_parser: Parser to use for model-generated tool calls into OpenAI API format.
        tool_parser_plugin: Plugin for registering custom tool call parsers.
        max_log_len: Maximum number of prompt characters or prompt ID numbers to print in logs.
        disable_fastapi_docs: Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint.
        enable_prompt_tokens_details: Include prompt_tokens_details in usage statistics.
        enable_server_load_tracking: Track server_load_metrics in the app state.
        enable_reasoning: Enable reasoning capabilities for supported models.
        reasoning_parser: Parser to use for model-generated reasoning into OpenAI API format.
    """

    host: str | None
    port: int
    uvicorn_log_level: Literal["debug", "info", "warning", "error", "critical", "trace"]
    allow_credentials: bool
    allowed_origins: str  # JSON string that will be parsed into list[str]
    allowed_methods: str  # JSON string that will be parsed into list[str]
    allowed_headers: str  # JSON string that will be parsed into list[str]
    api_key: str | None
    lora_modules: list[str] | None  # Each string is either 'name=path' or a JSON object
    prompt_adapters: list[str] | None  # Each string is in 'name=path' format
    chat_template: str | None
    chat_template_content_format: Literal["auto", "string", "openai"]
    response_role: str | None
    ssl_keyfile: str | None
    ssl_certfile: str | None
    ssl_ca_certs: str | None
    enable_ssl_refresh: bool
    ssl_cert_reqs: int
    root_path: str | None
    middleware: list[str]  # Each string is an import path
    return_tokens_as_token_ids: bool
    disable_frontend_multiprocessing: bool
    enable_request_id_headers: bool
    enable_auto_tool_choice: bool
    tool_call_parser: str | None
    tool_parser_plugin: str
    max_log_len: int | None
    disable_fastapi_docs: bool
    enable_prompt_tokens_details: bool
    enable_server_load_tracking: bool
    enable_reasoning: bool
    reasoning_parser: str | None
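

# Example (an illustrative sketch; the key, port, and adapter path are
# placeholders):
#
#     args = ServerArgs(
#         host="0.0.0.0",
#         port=8000,
#         api_key="my-secret-key",
#         uvicorn_log_level="info",
#         lora_modules=['{"name": "my-finetune@42", "path": "/checkpoints/my-finetune/0042"}'],
#     )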