from typing import Literal
from typing_extensions import TypedDict
from .engine import EngineArgs


def get_openai_server_config(
    model_name: str,
    base_model: str,
    log_file: str,
    lora_path: str | None = None,
    config: "OpenAIServerConfig | None" = None,
) -> "OpenAIServerConfig":
    import os

    if config is None:
        config = OpenAIServerConfig()
    log_file = config.get("log_file", log_file)

    # Build LoRA modules list for multi-checkpoint support.
    # Only register the explicit step-qualified name so unsuffixed
    # trainable model names fail loudly.
    lora_modules: list[str] | None = None
    if lora_path:
        step = int(os.path.basename(lora_path))
        lora_modules = [f'{{"name": "{model_name}@{step}", "path": "{lora_path}"}}']

    server_args = ServerArgs(
        api_key="default",
        lora_modules=lora_modules,
        return_tokens_as_token_ids=True,
        enable_auto_tool_choice=True,
        tool_call_parser="hermes",
    )
    server_args.update(config.get("server_args", {}))

    engine_args = EngineArgs(
        model=base_model,
        # Serve the base model under its own HF name when LoRA is enabled so
        # `model.name` does not silently route to a stale/incorrect adapter.
        served_model_name=base_model if lora_path else model_name,
        generation_config="vllm",
    )
    engine_args.update(config.get("engine_args", {}))

    return OpenAIServerConfig(
        log_file=log_file, server_args=server_args, engine_args=engine_args
    )
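

# Example usage (an illustrative sketch; the model name, base model, and paths
# below are placeholders, not values used elsewhere in this package):
#
#     config = get_openai_server_config(
#         model_name="my-finetune",
#         base_model="Qwen/Qwen2.5-7B-Instruct",
#         log_file="/tmp/openai_server.log",
#         lora_path="/checkpoints/my-finetune/0042",
#     )
#
# With a LoRA path, only the step-qualified adapter name "my-finetune@42" is
# registered, and the base model is served under its own HF name.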


class OpenAIServerConfig(TypedDict, total=False):
    """
    Server configuration.

    Args:
        log_file: Path to the log file.
        server_args: Arguments for the vLLM OpenAI-compatible server.
        engine_args: Additional vLLM engine arguments for the OpenAI-compatible server.
            Note that since the vLLM engine is initialized with Unsloth,
            these additional arguments will only have an effect if the
            OpenAI-compatible server uses them elsewhere.
    """

    log_file: str
    server_args: "ServerArgs"
    engine_args: "EngineArgs"
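

# Example override (an illustrative sketch; the log path, port, and API key are
# placeholders). A partial OpenAIServerConfig can be passed to
# get_openai_server_config, which merges it over the defaults it builds:
#
#     overrides = OpenAIServerConfig(
#         log_file="/tmp/openai_server.log",
#         server_args=ServerArgs(port=8001, api_key="my-secret-key"),
#     )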


class ServerArgs(TypedDict, total=False):
    """Arguments for the vLLM OpenAI-compatible server, not including AsyncEngineArgs.

    Fields:
        host: Host name for the server. If None, will listen on all available interfaces (0.0.0.0).
        port: Port number for the server to listen on.
        uvicorn_log_level: Log level for the uvicorn server.
        allow_credentials: Whether to allow credentials in CORS requests.
        allowed_origins: JSON string that will be parsed into a list of allowed CORS origins. Defaults to ["*"].
        allowed_methods: JSON string that will be parsed into a list of allowed HTTP methods. Defaults to ["*"].
        allowed_headers: JSON string that will be parsed into a list of allowed HTTP headers. Defaults to ["*"].
        api_key: API key required for authentication. If None, no authentication is required.
        lora_modules: List of LoRA module configurations. Each string is either in 'name=path' format or a JSON object.
        prompt_adapters: List of prompt adapter configurations. Each string is in 'name=path' format.
        chat_template: Path to chat template file or the template in single-line form.
        chat_template_content_format: Format to render message content within chat templates.
        response_role: Role name to return when request.add_generation_prompt is true.
        ssl_keyfile: Path to SSL key file for HTTPS.
        ssl_certfile: Path to SSL certificate file for HTTPS.
        ssl_ca_certs: Path to CA certificates file for HTTPS.
        enable_ssl_refresh: Whether to refresh SSL context when certificate files change.
        ssl_cert_reqs: SSL certificate requirements (see stdlib ssl module's CERT_* constants).
        root_path: FastAPI root_path when app is behind a path-based routing proxy.
        middleware: List of additional ASGI middleware to apply to the app. Each string is an import path.
        return_tokens_as_token_ids: When max_logprobs is specified, represent tokens as 'token_id:{token_id}' strings.
        disable_frontend_multiprocessing: Run OpenAI frontend server in same process as model serving engine.
        enable_request_id_headers: Add X-Request-Id header to responses (may impact performance at high QPS).
        enable_auto_tool_choice: Enable automatic tool choice for supported models.
        tool_call_parser: Parser to use for model-generated tool calls into OpenAI API format.
        tool_parser_plugin: Plugin for registering custom tool call parsers.
        max_log_len: Maximum number of prompt characters or prompt ID numbers to print in logs.
        disable_fastapi_docs: Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint.
        enable_prompt_tokens_details: Include prompt_tokens_details in usage statistics.
        enable_server_load_tracking: Track server_load_metrics in the app state.
        enable_reasoning: Enable reasoning capabilities for supported models.
        reasoning_parser: Parser to use for model-generated reasoning into OpenAI API format.
    """

    host: str | None
    port: int
    uvicorn_log_level: Literal["debug", "info", "warning", "error", "critical", "trace"]
    allow_credentials: bool
    allowed_origins: str  # JSON string that will be parsed into list[str]
    allowed_methods: str  # JSON string that will be parsed into list[str]
    allowed_headers: str  # JSON string that will be parsed into list[str]
    api_key: str | None
    lora_modules: list[str] | None  # Each string is either 'name=path' or a JSON object
    prompt_adapters: list[str] | None  # Each string is in 'name=path' format
    chat_template: str | None
    chat_template_content_format: Literal["auto", "string", "openai"]
    response_role: str | None
    ssl_keyfile: str | None
    ssl_certfile: str | None
    ssl_ca_certs: str | None
    enable_ssl_refresh: bool
    ssl_cert_reqs: int
    root_path: str | None
    middleware: list[str]  # Each string is an import path
    return_tokens_as_token_ids: bool
    disable_frontend_multiprocessing: bool
    enable_request_id_headers: bool
    enable_auto_tool_choice: bool
    tool_call_parser: str | None
    tool_parser_plugin: str
    max_log_len: int | None
    disable_fastapi_docs: bool
    enable_prompt_tokens_details: bool
    enable_server_load_tracking: bool
    enable_reasoning: bool
    reasoning_parser: str | None
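

# Example (an illustrative sketch; the key, port, and adapter path are
# placeholders):
#
#     args = ServerArgs(
#         host="0.0.0.0",
#         port=8000,
#         api_key="my-secret-key",
#         uvicorn_log_level="info",
#         lora_modules=['{"name": "my-finetune@42", "path": "/checkpoints/my-finetune/0042"}'],
#     )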