Token-in vLLM endpoint #1422
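Judging from the diff below, this PR adds a token-aware `/v1/chat/completions/tokens` route to the custom vLLM server entrypoint, backed by a new `OpenAIServingChatWithTokens` handler, and adjusts the inference parallelism config.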
Changes from all commits
File 1 (TOML config):

```diff
@@ -27,5 +27,4 @@ max_tokens = 1024
 [trainer] # Default trainer config

 [inference.parallel]
-dp = 3
-tp = 2
+dp = 6
```
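Both layouts occupy six inference GPUs: the old config ran 3 data-parallel replicas with 2-way tensor parallelism (3 × 2 = 6), while the new one runs 6 data-parallel replicas with `tp` presumably falling back to its default of 1 (6 × 1 = 6), trading tensor parallelism for pure data parallelism.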
File 2 (Python server entrypoint):

```diff
@@ -1,9 +1,20 @@
 from argparse import Namespace
 from collections.abc import AsyncIterator
 from contextlib import asynccontextmanager
+from http import HTTPStatus
 from typing import Any, Optional

+from fastapi.responses import JSONResponse, StreamingResponse
+from vllm.entrypoints.chat_utils import load_chat_template
+from vllm.entrypoints.logger import RequestLogger
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ChatCompletionResponse, ErrorResponse
+from vllm.entrypoints.utils import load_aware_call, with_cancellation
+
 from prime_rl.inference.patches import monkey_patch_prometheus_stat_logger_for_lora_in_dp_mode
+from prime_rl.inference.vllm.serving_chat_with_tokens import (
+    ChatCompletionRequestWithTokens,
+    OpenAIServingChatWithTokens,
+)

 # Monkeypatch PrometheusStatLogger to avoid NotImplementedError for LoRA in DP mode
 monkey_patch_prometheus_stat_logger_for_lora_in_dp_mode()
@@ -13,17 +24,20 @@
 import uvloop
 import vllm.envs as envs
-from fastapi import Request
-from vllm.config import LogprobsMode
+from fastapi import Depends, HTTPException, Request
+from vllm.config import LogprobsMode, VllmConfig
+from starlette.datastructures import State
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.launcher import serve_http
 from vllm.entrypoints.openai.api_server import (
+    base,
     build_app,
     build_async_engine_client_from_engine_args,
     init_app_state,
     load_log_config,
     maybe_register_tokenizer_info_endpoint,
+    validate_json_request,
 )
 from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
 from vllm.entrypoints.openai.tool_parsers import ToolParserManager
```
```diff
@@ -60,6 +74,49 @@ async def custom_build_async_engine_client(
     yield engine


+async def custom_init_app_state(engine_client: EngineClient, vllm_config: VllmConfig, state: State, args: Namespace):
+    await init_app_state(engine_client, vllm_config, state, args)
+
+    # Repeated from init_app_state so these values are available locally
+    if args.enable_log_requests:
+        request_logger = RequestLogger(max_log_len=args.max_log_len)
+    else:
+        request_logger = None
+
+    model_config = vllm_config.model_config
+
+    if envs.VLLM_USE_V1:
+        supported_tasks = await engine_client.get_supported_tasks()  # type: ignore
+    else:
+        supported_tasks = model_config.supported_tasks
+
+    resolved_chat_template = load_chat_template(args.chat_template)
+
+    # Also serve OAI chat completions with tokens
+    state.openai_serving_chat_with_tokens = (
+        OpenAIServingChatWithTokens(
+            engine_client,
+            model_config,
+            state.openai_serving_models,
+            args.response_role,
+            request_logger=request_logger,
+            chat_template=resolved_chat_template,
+            chat_template_content_format=args.chat_template_content_format,
+            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
+            enable_auto_tools=args.enable_auto_tool_choice,
+            exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
+            tool_parser=args.tool_call_parser,
+            reasoning_parser=args.reasoning_parser,
+            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
+            enable_force_include_usage=args.enable_force_include_usage,
+            enable_log_outputs=args.enable_log_outputs,
+            log_error_stack=args.log_error_stack,
+        )
+        if "generate" in supported_tasks
+        else None
+    )
+
+
 # Copied from vllm/entrypoints/openai/api_server.py
 # Only difference is that we inject custom routes and build async engine client differently
 async def custom_run_server_worker(listen_address, sock, args, client_config=None, **uvicorn_kwargs) -> None:
```
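`custom_init_app_state` deliberately calls the stock `init_app_state` first, so every standard OpenAI route keeps its usual state; it then re-derives the request logger, supported tasks, and chat template (presumably because those are locals inside `init_app_state` and not exposed on the app state, hence the repetition) in order to construct the additional `OpenAIServingChatWithTokens` handler, which is left as `None` whenever the model does not support the `generate` task.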
```diff
@@ -106,8 +163,41 @@ async def _init_broadcaster(request: Request):
         )
         return {"status": "ok"}

+    def chat_with_tokens(request: Request) -> Optional[OpenAIServingChatWithTokens]:
+        return request.app.state.openai_serving_chat_with_tokens
+
+    @app.post(
+        "/v1/chat/completions/tokens",
+        dependencies=[Depends(validate_json_request)],
+        responses={
+            HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
+            HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+            HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
+            HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+        },
+    )
+    @with_cancellation
+    @load_aware_call
+    async def _chat_with_tokens(request: ChatCompletionRequestWithTokens, raw_request: Request):
+        handler = chat_with_tokens(raw_request)
+        if handler is None:
+            return base(raw_request).create_error_response(
+                message="The model does not support Chat Completions API"
+            )
```
Review comment on the `handler is None` branch above — Bug: error response not wrapped in `JSONResponse` with status code. The bare `ErrorResponse` from `create_error_response` is returned directly, so it is serialized with a 200 status instead of being wrapped in a `JSONResponse` carrying its error code, as the non-streaming error branch below does.
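A minimal sketch of the kind of fix the reviewer is pointing at (not code from this PR), mirroring the wrapping that the error branch further down already does:

```python
# Sketch only: wrap the ErrorResponse like the branch below does, so the
# client receives the intended HTTP status code rather than a 200.
if handler is None:
    error = base(raw_request).create_error_response(
        message="The model does not support Chat Completions API"
    )
    return JSONResponse(content=error.model_dump(), status_code=error.error.code)
```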
```diff
+        try:
+            generator = await handler.create_chat_completion_with_tokens(request, raw_request)
+        except Exception as e:
+            raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)) from e
+
+        if isinstance(generator, ErrorResponse):
+            return JSONResponse(content=generator.model_dump(), status_code=generator.error.code)
+        elif isinstance(generator, ChatCompletionResponse):
+            return JSONResponse(content=generator.model_dump())
+
+        return StreamingResponse(content=generator, media_type="text/event-stream")

     vllm_config = await engine_client.get_vllm_config()
-    await init_app_state(engine_client, vllm_config, app.state, args)
+    await custom_init_app_state(engine_client, vllm_config, app.state, args)

     # This hack allows us to update lora adapters in-place by skipping the check for already loaded adapters.
     async def do_nothing(*args, **kwargs):
```
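For context, `with_cancellation` and `load_aware_call` are stock vLLM route decorators (roughly: abort work when the client disconnects, and track in-flight load). A hypothetical client call against the new route could look like the sketch below; the extra token fields that `ChatCompletionRequestWithTokens` accepts or returns are not visible in this diff, so only standard chat-completions fields appear, and the host, port, and model name are placeholders:

```python
# Hypothetical client call; the endpoint's shape beyond the standard OpenAI
# chat-completions fields is not shown in the diff. Host, port, and model
# name are placeholders.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions/tokens",
    json={
        "model": "placeholder-model",
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 16,
    },
)
resp.raise_for_status()
print(resp.json())  # standard chat completion payload, plus the PR's token info
```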