Commit cea95df

[Frontend] Create ErrorResponse instead of raising exceptions in run_batch (vllm-project#8347)
1 parent 6a512a0 · commit cea95df

2 files changed: +31 −4 lines


tests/entrypoints/openai/test_run_batch.py

Lines changed: 3 additions & 1 deletion
@@ -8,7 +8,9 @@
 INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
-{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
+{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-4", "method": "POST", "url": "/bad_url", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-5", "method": "POST", "url": "/v1/chat/completions", "body": {"stream": "True", "model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
 
 INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""

vllm/entrypoints/openai/run_batch.py

Lines changed: 28 additions & 3 deletions
@@ -1,4 +1,5 @@
 import asyncio
+from http import HTTPStatus
 from io import StringIO
 from typing import Awaitable, Callable, List, Optional
 
@@ -135,6 +136,25 @@ async def write_file(path_or_url: str, data: str) -> None:
         f.write(data)
 
 
+def make_error_request_output(request: BatchRequestInput,
+                              error_msg: str) -> BatchRequestOutput:
+    batch_output = BatchRequestOutput(
+        id=f"vllm-{random_uuid()}",
+        custom_id=request.custom_id,
+        response=BatchResponseData(
+            status_code=HTTPStatus.BAD_REQUEST,
+            request_id=f"vllm-batch-{random_uuid()}",
+        ),
+        error=error_msg,
+    )
+    return batch_output
+
+
+async def make_async_error_request_output(
+        request: BatchRequestInput, error_msg: str) -> BatchRequestOutput:
+    return make_error_request_output(request, error_msg)
+
+
 async def run_request(serving_engine_func: Callable,
                       request: BatchRequestInput,
                       tracker: BatchProgressTracker) -> BatchRequestOutput:
@@ -158,7 +178,8 @@ async def run_request(serving_engine_func: Callable,
             error=response,
         )
     else:
-        raise ValueError("Request must not be sent in stream mode")
+        batch_output = make_error_request_output(
+            request, error_msg="Request must not be sent in stream mode")
 
     tracker.completed()
     return batch_output
@@ -225,8 +246,12 @@ async def main(args):
                             tracker))
             tracker.submitted()
         else:
-            raise ValueError("Only /v1/chat/completions and /v1/embeddings are"
-                             "supported in the batch endpoint.")
+            response_futures.append(
+                make_async_error_request_output(
+                    request,
+                    error_msg="Only /v1/chat/completions and "
+                    "/v1/embeddings are supported in the batch endpoint.",
+                ))
 
     with tracker.pbar():
         responses = await asyncio.gather(*response_futures)
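Taken together, the run_batch.py changes turn both former raise sites into error rows that flow through the same asyncio.gather call as successful requests. The following is a minimal, self-contained sketch of that pattern; the dataclasses are simplified stand-ins for vLLM's BatchRequestInput / BatchResponseData / BatchRequestOutput models, and the fake ok() coroutine stands in for the real serving-engine call.

import asyncio
import uuid
from dataclasses import dataclass
from http import HTTPStatus
from typing import Optional


# Simplified stand-ins for vLLM's pydantic models; field names follow the diff.
@dataclass
class BatchRequestInput:
    custom_id: str
    url: str


@dataclass
class BatchResponseData:
    status_code: int
    request_id: str


@dataclass
class BatchRequestOutput:
    id: str
    custom_id: str
    response: Optional[BatchResponseData]
    error: Optional[str]


def make_error_request_output(request: BatchRequestInput,
                              error_msg: str) -> BatchRequestOutput:
    # Same shape as the helper added in run_batch.py: a BAD_REQUEST row keyed
    # by the request's custom_id, with the message carried in `error`.
    return BatchRequestOutput(
        id=f"vllm-{uuid.uuid4().hex}",
        custom_id=request.custom_id,
        response=BatchResponseData(
            status_code=HTTPStatus.BAD_REQUEST,
            request_id=f"vllm-batch-{uuid.uuid4().hex}",
        ),
        error=error_msg,
    )


async def make_async_error_request_output(
        request: BatchRequestInput, error_msg: str) -> BatchRequestOutput:
    # Wrapping the sync helper lets the submission loop append it to the same
    # list of awaitables as real requests and gather everything in one pass.
    return make_error_request_output(request, error_msg)


async def main() -> None:
    requests = [
        BatchRequestInput(custom_id="request-1", url="/v1/chat/completions"),
        BatchRequestInput(custom_id="request-4", url="/bad_url"),
    ]
    futures = []
    for req in requests:
        if req.url in ("/v1/chat/completions", "/v1/embeddings"):
            # A real deployment would call the serving engine here; this
            # sketch just fakes a successful row.
            async def ok(r=req):
                return BatchRequestOutput(id="ok", custom_id=r.custom_id,
                                          response=None, error=None)

            futures.append(ok())
        else:
            futures.append(
                make_async_error_request_output(
                    req, error_msg="Only /v1/chat/completions and "
                    "/v1/embeddings are supported in the batch endpoint."))
    for out in await asyncio.gather(*futures):
        print(out.custom_id, out.error)


asyncio.run(main())

Because every branch now appends an awaitable, gather preserves the input order of the batch file and a single bad line no longer aborts the whole run.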
