[V1] AsyncLLMEngine #9826

Draft: wants to merge 65 commits into base: main

Changes from 55 commits

Commits (65)
8f8662e
prototype
robertgshaw2-neuralmagic Oct 26, 2024
01c4ca8
revert spurious 2.5 changes
robertgshaw2-neuralmagic Oct 26, 2024
1ad8a48
stash
robertgshaw2-neuralmagic Oct 26, 2024
f9084f6
cleanup
robertgshaw2-neuralmagic Oct 26, 2024
72bccd9
add MQLLMEnginev1
robertgshaw2-neuralmagic Oct 26, 2024
a6cab52
work with MQLLMEngine
robertgshaw2-neuralmagic Oct 27, 2024
885ed16
format
robertgshaw2-neuralmagic Oct 27, 2024
3ed66cf
cleanup formatting
robertgshaw2-neuralmagic Oct 27, 2024
8ae8ce9
revert exmple change
robertgshaw2-neuralmagic Oct 27, 2024
5c72515
update comment
robertgshaw2-neuralmagic Oct 27, 2024
f9b33fa
formatting
robertgshaw2-neuralmagic Oct 27, 2024
82539b9
updated
robertgshaw2-neuralmagic Oct 27, 2024
d42a54e
stash
robertgshaw2-neuralmagic Oct 27, 2024
3a2d02a
format
robertgshaw2-neuralmagic Oct 27, 2024
6028ee1
Merge branch 'main' into rs-prototype-2
robertgshaw2-neuralmagic Oct 27, 2024
6bd37c1
update
robertgshaw2-neuralmagic Oct 27, 2024
196d822
revert bind/connect
robertgshaw2-neuralmagic Oct 27, 2024
a089cd1
revert comment
robertgshaw2-neuralmagic Oct 27, 2024
974aa06
formatting
robertgshaw2-neuralmagic Oct 27, 2024
fe1e1b4
formatting tweaks
robertgshaw2-neuralmagic Oct 27, 2024
9c27fbb
move detokenizer into engine
robertgshaw2-neuralmagic Oct 27, 2024
95b5af1
format
robertgshaw2-neuralmagic Oct 27, 2024
3999279
stash
robertgshaw2-neuralmagic Oct 27, 2024
b4dd571
revert bad import
robertgshaw2-neuralmagic Oct 27, 2024
f01f992
format
robertgshaw2-neuralmagic Oct 28, 2024
be333fa
format
robertgshaw2-neuralmagic Oct 28, 2024
aefb498
add files
robertgshaw2-neuralmagic Oct 28, 2024
6d7f473
stash
robertgshaw2-neuralmagic Oct 28, 2024
f431f8a
update
robertgshaw2-neuralmagic Oct 29, 2024
be431e4
update
robertgshaw2-neuralmagic Oct 29, 2024
36b7fa5
fix api client example to work with v1
robertgshaw2-neuralmagic Oct 29, 2024
3a5ce74
formatting
robertgshaw2-neuralmagic Oct 29, 2024
0d0251e
updated
robertgshaw2-neuralmagic Oct 29, 2024
046d78f
update
robertgshaw2-neuralmagic Oct 29, 2024
34c0665
update
robertgshaw2-neuralmagic Oct 29, 2024
52b790f
stash
robertgshaw2-neuralmagic Oct 30, 2024
4f9a86e
Stash
robertgshaw2-neuralmagic Oct 30, 2024
697b98f
stash
robertgshaw2-neuralmagic Oct 30, 2024
fa5c01d
LLMEngineWorking
robertgshaw2-neuralmagic Oct 30, 2024
0ca42d8
format
robertgshaw2-neuralmagic Oct 30, 2024
b6497d5
updated
robertgshaw2-neuralmagic Oct 30, 2024
ae88c73
updated
robertgshaw2-neuralmagic Oct 30, 2024
2161152
update
robertgshaw2-neuralmagic Oct 31, 2024
6a57297
aded processor
robertgshaw2-neuralmagic Oct 31, 2024
3665602
udpated
robertgshaw2-neuralmagic Oct 31, 2024
ed567ca
updated
robertgshaw2-neuralmagic Oct 31, 2024
f4005da
updated formats
robertgshaw2-neuralmagic Oct 31, 2024
67a53ed
revert
robertgshaw2-neuralmagic Oct 31, 2024
458b54f
finished
robertgshaw2-neuralmagic Oct 31, 2024
75ff707
updated
robertgshaw2-neuralmagic Oct 31, 2024
669648f
split core process into separate class
njhill Oct 31, 2024
127f09c
stash
robertgshaw2-neuralmagic Oct 31, 2024
99f683e
Merge pull request #22 from njhill/rework-splitcore
robertgshaw2-neuralmagic Oct 31, 2024
dc6163c
updated
robertgshaw2-neuralmagic Oct 31, 2024
d21cb8f
updated
robertgshaw2-neuralmagic Oct 31, 2024
565ffa6
working again
robertgshaw2-neuralmagic Oct 31, 2024
2960fbc
format
robertgshaw2-neuralmagic Oct 31, 2024
5d23709
updated
robertgshaw2-neuralmagic Oct 31, 2024
f2f2e40
updated
robertgshaw2-neuralmagic Oct 31, 2024
c10c9d8
better interface
robertgshaw2-neuralmagic Oct 31, 2024
b8767a9
formatting
robertgshaw2-neuralmagic Oct 31, 2024
ab783e1
format
robertgshaw2-neuralmagic Oct 31, 2024
423f47d
update
robertgshaw2-neuralmagic Oct 31, 2024
7c977d3
updated
robertgshaw2-neuralmagic Nov 1, 2024
3c14bdf
format
robertgshaw2-neuralmagic Nov 1, 2024
7 changes: 3 additions & 4 deletions examples/api_client.py
@@ -26,7 +26,6 @@ def post_http_request(prompt: str,
    pload = {
        "prompt": prompt,
        "n": n,
-       "use_beam_search": True,
        "temperature": 0.0,
        "max_tokens": 16,
        "stream": stream,
@@ -58,7 +57,7 @@ def get_response(response: requests.Response) -> List[str]:
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
-   parser.add_argument("--n", type=int, default=4)
+   parser.add_argument("--n", type=int, default=1)
    parser.add_argument("--prompt", type=str, default="San Francisco is a")
    parser.add_argument("--stream", action="store_true")
    args = parser.parse_args()
@@ -77,8 +76,8 @@ def get_response(response: requests.Response) -> List[str]:
            num_printed_lines = 0
            for i, line in enumerate(h):
                num_printed_lines += 1
-               print(f"Beam candidate {i}: {line!r}", flush=True)
+               print(f"Output {i}: {line!r}", flush=True)
    else:
        output = get_response(response)
        for i, line in enumerate(output):
-           print(f"Beam candidate {i}: {line!r}", flush=True)
+           print(f"Output {i}: {line!r}", flush=True)
7 changes: 6 additions & 1 deletion vllm/entrypoints/api_server.py
@@ -15,15 +15,20 @@
from fastapi.responses import JSONResponse, Response, StreamingResponse

from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.launcher import serve_http
+from vllm.envs import VLLM_USE_V1
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
from vllm.usage.usage_lib import UsageContext
from vllm.utils import (FlexibleArgumentParser, iterate_with_cancellation,
                        random_uuid)
from vllm.version import __version__ as VLLM_VERSION

+if VLLM_USE_V1:
+    from vllm.v1.engine.async_llm_engine import AsyncLLMEngine
+else:
+    from vllm.engine.async_llm_engine import AsyncLLMEngine

Check failure on line 30 in vllm/entrypoints/api_server.py (GitHub Actions / mypy 3.8, 3.10, 3.11, 3.12):

Incompatible import of "AsyncLLMEngine" (imported name has type "Type[vllm.engine.async_llm_engine.AsyncLLMEngine]", local name has type "Type[vllm.v1.engine.async_llm_engine.AsyncLLMEngine]") [assignment]

logger = init_logger("vllm.entrypoints.api_server")

TIMEOUT_KEEP_ALIVE = 5 # seconds.
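The mypy failures above come from binding two different AsyncLLMEngine classes to the same name. A minimal sketch of one common way to quiet the [assignment] error (not part of this PR, shown only as an illustration) keeps the conditional import and marks the second binding with a targeted ignore:

# Sketch only: same conditional import as in the diff above, with the
# second binding marked so mypy's [assignment] check is suppressed.
from vllm.envs import VLLM_USE_V1

if VLLM_USE_V1:
    from vllm.v1.engine.async_llm_engine import AsyncLLMEngine
else:
    from vllm.engine.async_llm_engine import AsyncLLMEngine  # type: ignore[assignment]

An alternative is to import both modules and select the class through a separately named variable, at the cost of always importing the unused engine.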
27 changes: 20 additions & 7 deletions vllm/v1/core/scheduler.py
@@ -1,12 +1,13 @@
from collections import deque
from dataclasses import dataclass
-from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import Deque, Dict, Iterable, List, Optional, Set, Union

from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
from vllm.logger import init_logger
from vllm.multimodal import MultiModalDataDict
from vllm.sampling_params import SamplingParams
from vllm.v1.core.kv_cache_manager import KVCacheManager
+from vllm.v1.engine import EngineCoreOutput
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus

@@ -227,13 +228,12 @@ def update_from_output(
        self,
        scheduler_output: "SchedulerOutput",
        model_runner_output: "ModelRunnerOutput",
-   ) -> List[Tuple[Request, int]]:
+   ) -> List[EngineCoreOutput]:
Review comment (Collaborator, Author):
I'm not sure it makes sense for this method to be in scheduler.py.

The only scheduler-related work here is updating self.running.

        # NOTE(woosuk): This method doesn't consider speculative decoding.
        sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist()
        num_scheduled_tokens = scheduler_output.num_scheduled_tokens
        new_running: List[Request] = []
-        # (request, num_sampled_tokens)
-        sampled: List[Tuple[Request, int]] = []
+        engine_core_outputs: List[EngineCoreOutput] = []
        for request in self.running:
            req_id = request.request_id
            request.num_computed_tokens += num_scheduled_tokens[req_id]
@@ -247,17 +247,30 @@ def update_from_output(
            # generates at most one token at each step.
            token_id = sampled_token_ids[req_index]
            request.output_token_ids.append(token_id)
-            sampled.append((request, 1))
+            num_new_tokens = 1

            # TODO: Update the KV cache manager for prefix caching.

-            # Check if the request is finished.
+            # Check for stop and update request state.
+            # This must be called before we make the EngineCoreOutput.
            stopped = self._check_stop(request)

+            # Add EngineCoreOutput for this Request.
+            output = EngineCoreOutput(
+                request_id=req_id,
+                new_token_ids=request.output_token_ids[-num_new_tokens:],
+                finished=request.is_finished(),
+                finish_reason=request.get_finished_reason(),
+                stop_reason=request.stop_reason)
+            engine_core_outputs.append(output)
+
+            # Break out of the loop.
            if stopped:
                continue

            new_running.append(request)
        self.running = new_running
-        return sampled
+        return engine_core_outputs

    def _check_stop(self, request: Request) -> bool:
        if (request.num_tokens >= self.max_model_len
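To illustrate the review comment above, here is a purely hypothetical sketch of how the EngineCoreOutput construction could live outside the scheduler, leaving update_from_output to manage only self.running. The helper name make_engine_core_outputs is illustrative and not part of the PR:

# Hypothetical helper (not in the PR): build EngineCoreOutputs from the
# (request, num_sampled_tokens) pairs the scheduler used to return, so the
# scheduler itself only updates self.running.
from typing import List, Tuple

from vllm.v1.engine import EngineCoreOutput
from vllm.v1.request import Request


def make_engine_core_outputs(
        sampled: List[Tuple[Request, int]]) -> List[EngineCoreOutput]:
    outputs: List[EngineCoreOutput] = []
    for request, num_new_tokens in sampled:
        outputs.append(
            EngineCoreOutput(
                request_id=request.request_id,
                new_token_ids=request.output_token_ids[-num_new_tokens:],
                finished=request.is_finished(),
                finish_reason=request.get_finished_reason(),
                stop_reason=request.stop_reason))
    return outputs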
62 changes: 62 additions & 0 deletions vllm/v1/engine/__init__.py
@@ -0,0 +1,62 @@
import asyncio
from dataclasses import dataclass
from typing import List, Optional, Union

import msgspec

from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import RequestOutputKind, SamplingParams

LLM_ENGINE_CORE_READY_STR = "READY"
POLLING_TIMEOUT_MS = 5000


@dataclass
class DetokenizerRequest:

    request_id: str
    prompt: Optional[str]
    prompt_token_ids: List[int]
    skip_special_tokens: bool
    spaces_between_special_tokens: bool
    output_kind: RequestOutputKind

    # Queue for streaming outputs to clients.
    output_queue: Optional[asyncio.Queue[RequestOutput]] = None


class EngineCoreRequest(msgspec.Struct):

    # NOTE: prompt and prompt_token_ids should be DecoderOnlyInput,
    # but this object is currently not playing well with msgspec
    # due to circular imports and typing we have in data.py

    request_id: str
    # NOTE(Nick): I don't think we need to pass prompt here since it should
    # always be tokenized?
    # prompt: Optional[str]
    prompt_token_ids: List[int]
    sampling_params: SamplingParams
    eos_token_id: Optional[int]
    arrival_time: float
    lora_request: Optional[LoRARequest]


@dataclass
class EngineCoreOutput:

    request_id: str
    new_token_ids: List[int]
    finished: bool
    finish_reason: Optional[str] = None
    stop_reason: Union[int, str, None] = None


class EngineCoreOutputs(msgspec.Struct):

    # NOTE(Nick): We could consider ways to make this more compact,
    # e.g. columnwise layout and using an int enum for finish/stop reason

    # [num_reqs]
    outputs: List[EngineCoreOutput]
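Since EngineCoreOutputs is a msgspec.Struct (and msgspec also handles the nested EngineCoreOutput dataclass), these messages can be round-tripped through msgspec's msgpack codec. A minimal sketch, assuming only the definitions above; the transport itself (e.g. a ZMQ socket) is elided:

# Minimal sketch: encode EngineCoreOutputs to bytes on the engine-core side
# and decode them back into the typed struct on the receiving side.
import msgspec

from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs

outputs = EngineCoreOutputs(outputs=[
    EngineCoreOutput(request_id="req-0", new_token_ids=[42], finished=False),
])

payload: bytes = msgspec.msgpack.encode(outputs)

decoded = msgspec.msgpack.decode(payload, type=EngineCoreOutputs)
assert decoded.outputs[0].new_token_ids == [42]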