diff --git a/examples/offline_inference_openai.md b/examples/offline_inference_openai.md new file mode 100644 index 0000000000000..40462ce1eb78c --- /dev/null +++ b/examples/offline_inference_openai.md @@ -0,0 +1,172 @@ +# Offline Inference with the OpenAI Batch file format + + **NOTE:** This is a guide to performing batch inference using the OpenAI batch file format, **NOT** the complete Batch (REST) API. + + ## File Format + + The OpenAI batch file format consists of a series of JSON objects, one per line. + + [See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/openai_example_batch.jsonl) + + Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. + + **NOTE:** We currently only support the `/v1/chat/completions` endpoint (embeddings and completions coming soon). + + ## Prerequisites + +* Ensure you are using `vllm >= 0.4.3`. You can check by running `python -c "import vllm; print(vllm.__version__)"`. +* The examples in this document use `meta-llama/Meta-Llama-3-8B-Instruct`. + - Create a [user access token](https://huggingface.co/docs/hub/en/security-tokens). + - Install the token on your machine (Run `huggingface-cli login`). + - Get access to the gated model by [visiting the model card](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and agreeing to the terms and conditions. + + + ## Example: Running with a local file + + ### Step 1: Create your batch file + + To follow along with this example, you can download the example batch file or create your own batch file in your working directory. + + ``` + wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl + ``` + + Once you've created your batch file, it should look like this: + + ``` + $ cat openai_example_batch.jsonl +{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} + ``` + + ### Step 2: Run the batch + +The batch running tool is designed to be used from the command line. + +You can run the batch with the following command, which will write its results to a file called `results.jsonl`: + +``` +python -m vllm.entrypoints.openai.run_batch -i openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +``` + +### Step 3: Check your results + +You should now have your results at `results.jsonl`. You can check them by running `cat results.jsonl`: + +``` +$ cat results.jsonl +{"id":"vllm-383d1c59835645aeb2e07d004d62a826","custom_id":"request-1","response":{"id":"cmpl-61c020e54b964d5a98fa7527bfcdd378","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"Hello! It's great to meet you! I'm here to help with any questions or tasks you may have.
What's on your mind today?"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":25,"total_tokens":56,"completion_tokens":31}},"error":null} +{"id":"vllm-42e3d09b14b04568afa3f1797751a267","custom_id":"request-2","response":{"id":"cmpl-f44d049f6b3a42d4b2d7850bb1e31bcc","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"*silence*"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":27,"total_tokens":32,"completion_tokens":5}},"error":null} +``` + +## Example 2: Using remote files + +The batch runner supports remote input and output urls that are accessible via http/https. + +For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl`, you can run: + +``` +python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +``` + +## Example 3: Integrating with AWS S3 + +To integrate with cloud blob storage, we recommend using presigned urls. + +[Learn more about S3 presigned urls here.](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-presigned-url.html) + +### Additional prerequisites + +* [Create an S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html). +* The `awscli` package (Run `pip install awscli`) to configure your credentials and interactively use S3. + - [Configure your credentials](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-quickstart.html). +* The `boto3` python package (Run `pip install boto3`) to generate presigned urls. + +### Step 1: Upload your input file + +To follow along with this example, you can download the example batch file or create your own batch file in your working directory. + + ``` + wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl + ``` + + Once you've created your batch file, it should look like this: + + ``` + $ cat openai_example_batch.jsonl +{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} + ``` + +Now upload your batch file to your S3 bucket. + +``` +aws s3 cp openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl +``` + + +### Step 2: Generate your presigned urls + +Presigned PUT urls can only be generated via the SDK. You can run the following python script to generate your presigned urls. Be sure to replace the `MY_BUCKET`, `MY_INPUT_FILE.jsonl`, and `MY_OUTPUT_FILE.jsonl` placeholders with your bucket and file names. + +(The script is adapted from https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/s3/s3_basics/presigned_url.py) + +``` +import boto3 +from botocore.exceptions import ClientError + +def generate_presigned_url(s3_client, client_method, method_parameters, expires_in): + """ + Generate a presigned Amazon S3 URL that can be used to perform an action.
+ + :param s3_client: A Boto3 Amazon S3 client. + :param client_method: The name of the client method that the URL performs. + :param method_parameters: The parameters of the specified client method. + :param expires_in: The number of seconds the presigned URL is valid for. + :return: The presigned URL. + """ + try: + url = s3_client.generate_presigned_url( + ClientMethod=client_method, Params=method_parameters, ExpiresIn=expires_in + ) + except ClientError: + raise + return url + + +s3_client = boto3.client("s3") +input_url = generate_presigned_url( + s3_client, "get_object", {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, 3600 +) +output_url = generate_presigned_url( + s3_client, "put_object", {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, 3600 +) +print(f"{input_url=}") +print(f"{output_url=}") +``` + +This script should output: + +``` +input_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091' +output_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091' +``` + +### Step 3: Run the batch runner using your presigned urls + +You can now run the batch runner using the urls generated in the previous section: + +``` +python -m vllm.entrypoints.openai.run_batch \ + -i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \ + -o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \ + --model meta-llama/Meta-Llama-3-8B-Instruct +``` + +### Step 4: View your results + +Your results are now on S3. You can view them in your terminal by running: + +``` +aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl - +``` diff --git a/examples/openai_example_batch.jsonl b/examples/openai_example_batch.jsonl new file mode 100644 index 0000000000000..5aa7e185c180a --- /dev/null +++ b/examples/openai_example_batch.jsonl @@ -0,0 +1,2 @@ +{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} diff --git a/requirements-common.txt b/requirements-common.txt index bd779d5acb68e..cc4b15d877d0f 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -8,6 +8,7 @@ py-cpuinfo transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3. tokenizers >= 0.19.1 # Required for Llama 3. fastapi +aiohttp openai uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server.
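The `openai_example_batch.jsonl` example above is written by hand; for larger workloads the same file can be generated programmatically. A minimal sketch (the prompts, model name, and output path are placeholders):

```
import json

# Each line of the batch file is one request in the documented OpenAI batch
# format: a unique custom_id, the HTTP method, the target endpoint, and the
# request body for /v1/chat/completions.
requests = [
    {
        "custom_id": f"request-{i}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ],
            "max_tokens": 1000,
        },
    }
    for i, prompt in enumerate(["Hello world!", "What is batch inference?"], start=1)
]

with open("openai_example_batch.jsonl", "w") as f:
    for request in requests:
        f.write(json.dumps(request) + "\n")
```

Because each line is a standalone JSON object, the file can also be appended to incrementally.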
diff --git a/tests/entrypoints/test_openai_run_batch.py b/tests/entrypoints/test_openai_run_batch.py new file mode 100644 index 0000000000000..5de28513ca391 --- /dev/null +++ b/tests/entrypoints/test_openai_run_batch.py @@ -0,0 +1,53 @@ +import subprocess +import sys +import tempfile + +from vllm.entrypoints.openai.protocol import BatchRequestOutput + +# ruff: noqa: E501 +INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}""" + +INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}""" + + +def test_e2e(): + with tempfile.NamedTemporaryFile( + "w") as input_file, tempfile.NamedTemporaryFile( + "r") as output_file: + input_file.write(INPUT_BATCH) + input_file.flush() + proc = subprocess.Popen([ + sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i", + input_file.name, "-o", output_file.name, "--model", + "NousResearch/Meta-Llama-3-8B-Instruct" + ], ) + proc.communicate() + proc.wait() + assert proc.returncode == 0, f"{proc=}" + + contents = output_file.read() + for line in contents.strip().split("\n"): + # Ensure that the output format conforms to the openai api. + # Validation should throw if the schema is wrong. + BatchRequestOutput.model_validate_json(line) + + +def test_e2e_invalid_input(): + """ + Ensure that we fail when the input doesn't conform to the openai api. + """ + with tempfile.NamedTemporaryFile( + "w") as input_file, tempfile.NamedTemporaryFile( + "r") as output_file: + input_file.write(INVALID_INPUT_BATCH) + input_file.flush() + proc = subprocess.Popen([ + sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i", + input_file.name, "-o", output_file.name, "--model", + "NousResearch/Meta-Llama-3-8B-Instruct" + ], ) + proc.communicate() + proc.wait() + assert proc.returncode != 0, f"{proc=}" diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 35dfa09ac12ba..41e2f77fe56f1 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -526,3 +526,44 @@ class ChatCompletionStreamResponse(OpenAIBaseModel): model: str choices: List[ChatCompletionResponseStreamChoice] usage: Optional[UsageInfo] = Field(default=None) + + +class BatchRequestInput(OpenAIBaseModel): + """ + The per-line object of the batch input file. + + NOTE: Currently only the `/v1/chat/completions` endpoint is supported. + """ + + # A developer-provided per-request id that will be used to match outputs to + # inputs. Must be unique for each request in a batch. 
+ custom_id: str + + # The HTTP method to be used for the request. Currently only POST is + # supported. + method: str + + # The OpenAI API relative URL to be used for the request. Currently + # /v1/chat/completions is supported. + url: str + + # The parameters of the request. + body: Union[ChatCompletionRequest, ] + + +class BatchRequestOutput(OpenAIBaseModel): + """ + The per-line object of the batch output and error files. + """ + + id: str + + # A developer-provided per-request id that will be used to match outputs to + # inputs. + custom_id: str + + response: Optional[ChatCompletionResponse] + + # For requests that failed with a non-HTTP error, this will contain more + # information on the cause of the failure. + error: Optional[Any] diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py new file mode 100644 index 0000000000000..99f1b2d6d091b --- /dev/null +++ b/vllm/entrypoints/openai/run_batch.py @@ -0,0 +1,141 @@ +import argparse +import asyncio +import sys +from io import StringIO + +import aiohttp + +import vllm +from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.entrypoints.openai.protocol import (BatchRequestInput, + BatchRequestOutput, + ChatCompletionResponse) +from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.logger import init_logger +from vllm.usage.usage_lib import UsageContext +from vllm.utils import random_uuid + +logger = init_logger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="vLLM OpenAI-Compatible batch runner.") + parser.add_argument( + "-i", + "--input-file", + required=True, + type=str, + help= + "The path or url to a single input file. Currently supports local file " + "paths, or the http protocol (http or https). If a URL is specified, " + "the file should be available via HTTP GET.") + parser.add_argument( + "-o", + "--output-file", + required=True, + type=str, + help="The path or url to a single output file. Currently supports " + "local file paths, or web (http or https) urls. If a URL is specified," + " the file should be available via HTTP PUT.") + parser.add_argument("--response-role", + type=nullable_str, + default="assistant", + help="The role name to return if " + "`request.add_generation_prompt=true`.") + + parser = AsyncEngineArgs.add_cli_args(parser) + return parser.parse_args() + + +async def read_file(path_or_url: str) -> str: + if path_or_url.startswith("http://") or path_or_url.startswith("https://"): + async with aiohttp.ClientSession() as session, \ + session.get(path_or_url) as resp: + return await resp.text() + else: + with open(path_or_url, "r") as f: + return f.read() + + +async def write_file(path_or_url: str, data: str) -> None: + if path_or_url.startswith("http://") or path_or_url.startswith("https://"): + async with aiohttp.ClientSession() as session, \ + session.put(path_or_url, data=data.encode("utf-8")): + pass + else: + # We should make this async, but as long as this is always run as a + # standalone program, blocking the event loop won't affect performance + # in this particular case.
+ with open(path_or_url, "w") as f: + f.write(data) + + +async def run_request(chat_serving: OpenAIServingChat, + request: BatchRequestInput) -> BatchRequestOutput: + chat_request = request.body + chat_response = await chat_serving.create_chat_completion(chat_request) + if isinstance(chat_response, ChatCompletionResponse): + batch_output = BatchRequestOutput( + id=f"vllm-{random_uuid()}", + custom_id=request.custom_id, + response=chat_response, + error=None, + ) + else: + batch_output = BatchRequestOutput( + id=f"vllm-{random_uuid()}", + custom_id=request.custom_id, + response=None, + error=chat_response, + ) + return batch_output + + +async def main(args): + if args.served_model_name is not None: + served_model_names = args.served_model_name + else: + served_model_names = [args.model] + + engine_args = AsyncEngineArgs.from_cli_args(args) + engine = AsyncLLMEngine.from_engine_args( + engine_args, usage_context=UsageContext.OPENAI_API_SERVER) + + # When using single vLLM without engine_use_ray + model_config = await engine.get_model_config() + + openai_serving_chat = OpenAIServingChat( + engine, + model_config, + served_model_names, + args.response_role, + ) + + # Submit all requests in the file to the engine "concurrently". + response_futures = [] + for request_json in (await read_file(args.input_file)).strip().split("\n"): + request = BatchRequestInput.model_validate_json(request_json) + response_futures.append(run_request(openai_serving_chat, request)) + + responses = await asyncio.gather(*response_futures) + + output_buffer = StringIO() + for response in responses: + print(response.model_dump_json(), file=output_buffer) + + output_buffer.seek(0) + await write_file(args.output_file, output_buffer.read().strip()) + + # Temporary workaround for https://github.com/vllm-project/vllm/issues/4789 + sys.exit(0) + + +if __name__ == "__main__": + args = parse_args() + + logger.info("vLLM API server version %s", vllm.__version__) + logger.info("args: %s", args) + + asyncio.run(main(args)) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 65824a2206be9..c86e41c601be0 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -119,7 +119,9 @@ def _parse_chat_message_content( return self._parse_chat_message_content_parts(role, content) async def create_chat_completion( - self, request: ChatCompletionRequest, raw_request: Request + self, + request: ChatCompletionRequest, + raw_request: Optional[Request] = None ) -> Union[ErrorResponse, AsyncGenerator[str, None], ChatCompletionResponse]: """Completion API similar to OpenAI's API. @@ -337,7 +339,7 @@ async def chat_completion_stream_generator( yield "data: [DONE]\n\n" async def chat_completion_full_generator( - self, request: ChatCompletionRequest, raw_request: Request, + self, request: ChatCompletionRequest, raw_request: Optional[Request], result_generator: AsyncIterator[RequestOutput], request_id: str, conversation: List[ConversationMessage] ) -> Union[ErrorResponse, ChatCompletionResponse]: @@ -347,7 +349,7 @@ async def chat_completion_full_generator( final_res: Optional[RequestOutput] = None async for res in result_generator: - if await raw_request.is_disconnected(): + if raw_request is not None and await raw_request.is_disconnected(): # Abort the request if the client disconnects. await self.engine.abort(request_id) return self.create_error_response("Client disconnected")
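Once a batch finishes, the per-line records in `results.jsonl` can be consumed with the same pydantic models used above; `BatchRequestOutput.model_validate_json` parses one output line, exactly as the test does. A minimal sketch for reading results from a local file (the file name and field access follow the example output shown in the guide):

```
from vllm.entrypoints.openai.protocol import BatchRequestOutput

with open("results.jsonl") as f:
    for line in f:
        # Each output line validates against the BatchRequestOutput schema.
        result = BatchRequestOutput.model_validate_json(line)
        if result.error is not None:
            print(f"{result.custom_id}: failed with {result.error}")
            continue
        # For /v1/chat/completions requests, the response is a ChatCompletionResponse.
        message = result.response.choices[0].message
        print(f"{result.custom_id}: {message.content}")
```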