diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 3b56ad63f375d..b57d79859aec5 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -672,6 +672,17 @@ class BatchRequestInput(OpenAIBaseModel):
     body: Union[ChatCompletionRequest, ]
 
 
+class BatchResponseData(OpenAIBaseModel):
+    # HTTP status code of the response.
+    status_code: int = 200
+
+    # A unique identifier for the API request.
+    request_id: str
+
+    # The body of the response.
+    body: Union[ChatCompletionResponse, ]
+
+
 class BatchRequestOutput(OpenAIBaseModel):
     """
     The per-line object of the batch output and error files
@@ -683,7 +694,7 @@ class BatchRequestOutput(OpenAIBaseModel):
     # inputs.
     custom_id: str
 
-    response: Optional[ChatCompletionResponse]
+    response: Optional[BatchResponseData]
 
     # For requests that failed with a non-HTTP error, this will contain more
     # information on the cause of the failure.
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 91e567924b59e..b0c0f4ad2f47e 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -10,7 +10,9 @@
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import (BatchRequestInput,
                                               BatchRequestOutput,
-                                              ChatCompletionResponse)
+                                              BatchResponseData,
+                                              ChatCompletionResponse,
+                                              ErrorResponse)
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
@@ -77,20 +79,27 @@ async def run_request(chat_serving: OpenAIServingChat,
                       request: BatchRequestInput) -> BatchRequestOutput:
     chat_request = request.body
     chat_response = await chat_serving.create_chat_completion(chat_request)
+
     if isinstance(chat_response, ChatCompletionResponse):
         batch_output = BatchRequestOutput(
             id=f"vllm-{random_uuid()}",
             custom_id=request.custom_id,
-            response=chat_response,
+            response=BatchResponseData(
+                body=chat_response, request_id=f"vllm-batch-{random_uuid()}"),
             error=None,
         )
-    else:
+    elif isinstance(chat_response, ErrorResponse):
         batch_output = BatchRequestOutput(
             id=f"vllm-{random_uuid()}",
             custom_id=request.custom_id,
-            response=None,
+            response=BatchResponseData(
+                status_code=chat_response.code,
+                request_id=f"vllm-batch-{random_uuid()}"),
             error=chat_response,
         )
+    else:
+        raise ValueError("Request must not be sent in stream mode")
+
     return batch_output
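
After this change, each line of the batch output file carries a BatchResponseData object under "response" (status_code, request_id, and body) instead of a bare ChatCompletionResponse, and streaming responses are rejected with a ValueError. The sketch below shows how a consumer might read the new per-line shape; the example field values and the parse_output_line helper are illustrative assumptions, not part of this diff.

import json
from typing import Any, Optional

# Hypothetical output lines illustrating the nested schema introduced by
# BatchResponseData: "response" now wraps the completion in status_code,
# request_id, and body. All concrete values here are made up for illustration.
SUCCESS_LINE = json.dumps({
    "id": "vllm-abc123",
    "custom_id": "request-1",
    "response": {
        "status_code": 200,
        "request_id": "vllm-batch-def456",
        "body": {"choices": [{"message": {"role": "assistant",
                                          "content": "Hello!"}}]},
    },
    "error": None,
})

ERROR_LINE = json.dumps({
    "id": "vllm-abc124",
    "custom_id": "request-2",
    "response": {"status_code": 400, "request_id": "vllm-batch-def457"},
    "error": {"message": "bad request", "code": 400},
})


def parse_output_line(line: str) -> Optional[Any]:
    """Return the chat completion body for a successful line, else None."""
    record = json.loads(line)
    response = record.get("response") or {}
    if record.get("error") is None and response.get("status_code") == 200:
        return response.get("body")
    return None


if __name__ == "__main__":
    print(parse_output_line(SUCCESS_LINE))  # nested completion body
    print(parse_output_line(ERROR_LINE))    # None: failed request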