[V1] AsyncLLMEngine #9826

Draft: wants to merge 65 commits into base: main

Changes from 55 commits

Commits (65)
8f8662e
prototype
robertgshaw2-neuralmagic Oct 26, 2024
01c4ca8
revert spurious 2.5 changes
robertgshaw2-neuralmagic Oct 26, 2024
1ad8a48
stash
robertgshaw2-neuralmagic Oct 26, 2024
f9084f6
cleanup
robertgshaw2-neuralmagic Oct 26, 2024
72bccd9
add MQLLMEnginev1
robertgshaw2-neuralmagic Oct 26, 2024
a6cab52
work with MQLLMEngine
robertgshaw2-neuralmagic Oct 27, 2024
885ed16
format
robertgshaw2-neuralmagic Oct 27, 2024
3ed66cf
cleanup formatting
robertgshaw2-neuralmagic Oct 27, 2024
8ae8ce9
revert exmple change
robertgshaw2-neuralmagic Oct 27, 2024
5c72515
update comment
robertgshaw2-neuralmagic Oct 27, 2024
f9b33fa
formatting
robertgshaw2-neuralmagic Oct 27, 2024
82539b9
updated
robertgshaw2-neuralmagic Oct 27, 2024
d42a54e
stash
robertgshaw2-neuralmagic Oct 27, 2024
3a2d02a
format
robertgshaw2-neuralmagic Oct 27, 2024
6028ee1
Merge branch 'main' into rs-prototype-2
robertgshaw2-neuralmagic Oct 27, 2024
6bd37c1
update
robertgshaw2-neuralmagic Oct 27, 2024
196d822
revert bind/connect
robertgshaw2-neuralmagic Oct 27, 2024
a089cd1
revert comment
robertgshaw2-neuralmagic Oct 27, 2024
974aa06
formatting
robertgshaw2-neuralmagic Oct 27, 2024
fe1e1b4
formatting tweaks
robertgshaw2-neuralmagic Oct 27, 2024
9c27fbb
move detokenizer into engine
robertgshaw2-neuralmagic Oct 27, 2024
95b5af1
format
robertgshaw2-neuralmagic Oct 27, 2024
3999279
stash
robertgshaw2-neuralmagic Oct 27, 2024
b4dd571
revert bad import
robertgshaw2-neuralmagic Oct 27, 2024
f01f992
format
robertgshaw2-neuralmagic Oct 28, 2024
be333fa
format
robertgshaw2-neuralmagic Oct 28, 2024
aefb498
add files
robertgshaw2-neuralmagic Oct 28, 2024
6d7f473
stash
robertgshaw2-neuralmagic Oct 28, 2024
f431f8a
update
robertgshaw2-neuralmagic Oct 29, 2024
be431e4
update
robertgshaw2-neuralmagic Oct 29, 2024
36b7fa5
fix api client example to work with v1
robertgshaw2-neuralmagic Oct 29, 2024
3a5ce74
formatting
robertgshaw2-neuralmagic Oct 29, 2024
0d0251e
updated
robertgshaw2-neuralmagic Oct 29, 2024
046d78f
update
robertgshaw2-neuralmagic Oct 29, 2024
34c0665
update
robertgshaw2-neuralmagic Oct 29, 2024
52b790f
stash
robertgshaw2-neuralmagic Oct 30, 2024
4f9a86e
Stash
robertgshaw2-neuralmagic Oct 30, 2024
697b98f
stash
robertgshaw2-neuralmagic Oct 30, 2024
fa5c01d
LLMEngineWorking
robertgshaw2-neuralmagic Oct 30, 2024
0ca42d8
format
robertgshaw2-neuralmagic Oct 30, 2024
b6497d5
updated
robertgshaw2-neuralmagic Oct 30, 2024
ae88c73
updated
robertgshaw2-neuralmagic Oct 30, 2024
2161152
update
robertgshaw2-neuralmagic Oct 31, 2024
6a57297
aded processor
robertgshaw2-neuralmagic Oct 31, 2024
3665602
udpated
robertgshaw2-neuralmagic Oct 31, 2024
ed567ca
updated
robertgshaw2-neuralmagic Oct 31, 2024
f4005da
updated formats
robertgshaw2-neuralmagic Oct 31, 2024
67a53ed
revert
robertgshaw2-neuralmagic Oct 31, 2024
458b54f
finished
robertgshaw2-neuralmagic Oct 31, 2024
75ff707
updated
robertgshaw2-neuralmagic Oct 31, 2024
669648f
split core process into separate class
njhill Oct 31, 2024
127f09c
stash
robertgshaw2-neuralmagic Oct 31, 2024
99f683e
Merge pull request #22 from njhill/rework-splitcore
robertgshaw2-neuralmagic Oct 31, 2024
dc6163c
updated
robertgshaw2-neuralmagic Oct 31, 2024
d21cb8f
updated
robertgshaw2-neuralmagic Oct 31, 2024
565ffa6
working again
robertgshaw2-neuralmagic Oct 31, 2024
2960fbc
format
robertgshaw2-neuralmagic Oct 31, 2024
5d23709
updated
robertgshaw2-neuralmagic Oct 31, 2024
f2f2e40
updated
robertgshaw2-neuralmagic Oct 31, 2024
c10c9d8
better interface
robertgshaw2-neuralmagic Oct 31, 2024
b8767a9
formatting
robertgshaw2-neuralmagic Oct 31, 2024
ab783e1
format
robertgshaw2-neuralmagic Oct 31, 2024
423f47d
update
robertgshaw2-neuralmagic Oct 31, 2024
7c977d3
updated
robertgshaw2-neuralmagic Nov 1, 2024
3c14bdf
format
robertgshaw2-neuralmagic Nov 1, 2024
7 changes: 3 additions & 4 deletions examples/api_client.py
@@ -26,7 +26,6 @@ def post_http_request(prompt: str,
    pload = {
        "prompt": prompt,
        "n": n,
-       "use_beam_search": True,
        "temperature": 0.0,
        "max_tokens": 16,
        "stream": stream,
@@ -58,7 +57,7 @@ def get_response(response: requests.Response) -> List[str]:
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
-   parser.add_argument("--n", type=int, default=4)
+   parser.add_argument("--n", type=int, default=1)
    parser.add_argument("--prompt", type=str, default="San Francisco is a")
    parser.add_argument("--stream", action="store_true")
    args = parser.parse_args()
@@ -77,8 +76,8 @@ def get_response(response: requests.Response) -> List[str]:
            num_printed_lines = 0
            for i, line in enumerate(h):
                num_printed_lines += 1
-               print(f"Beam candidate {i}: {line!r}", flush=True)
+               print(f"Output {i}: {line!r}", flush=True)
    else:
        output = get_response(response)
        for i, line in enumerate(output):
-           print(f"Beam candidate {i}: {line!r}", flush=True)
+           print(f"Output {i}: {line!r}", flush=True)
7 changes: 6 additions & 1 deletion vllm/entrypoints/api_server.py
@@ -15,15 +15,20 @@
from fastapi.responses import JSONResponse, Response, StreamingResponse

from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.launcher import serve_http
+from vllm.envs import VLLM_USE_V1
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
from vllm.usage.usage_lib import UsageContext
from vllm.utils import (FlexibleArgumentParser, iterate_with_cancellation,
                        random_uuid)
from vllm.version import __version__ as VLLM_VERSION

+if VLLM_USE_V1:
+    from vllm.v1.engine.async_llm_engine import AsyncLLMEngine
+else:
+    from vllm.engine.async_llm_engine import AsyncLLMEngine

Check failure on line 30 in vllm/entrypoints/api_server.py (GitHub Actions / mypy 3.8, 3.10, 3.11, 3.12):

Incompatible import of "AsyncLLMEngine" (imported name has type "Type[vllm.engine.async_llm_engine.AsyncLLMEngine]", local name has type "Type[vllm.v1.engine.async_llm_engine.AsyncLLMEngine]") [assignment]

logger = init_logger("vllm.entrypoints.api_server")

TIMEOUT_KEEP_ALIVE = 5 # seconds.
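The mypy failures above come from binding two different AsyncLLMEngine classes to the same name. A minimal sketch of one common way to quiet the [assignment] error (not part of this PR, shown only as an illustration) keeps the conditional import and marks the second binding with a targeted ignore:

# Sketch only: same conditional import as in the diff above, with the
# second binding marked so mypy's [assignment] check is suppressed.
from vllm.envs import VLLM_USE_V1

if VLLM_USE_V1:
    from vllm.v1.engine.async_llm_engine import AsyncLLMEngine
else:
    from vllm.engine.async_llm_engine import AsyncLLMEngine  # type: ignore[assignment]

An alternative is to import both modules and select the class through a separately named variable, at the cost of always importing the unused engine.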
27 changes: 20 additions & 7 deletions vllm/v1/core/scheduler.py
@@ -1,12 +1,13 @@
from collections import deque
from dataclasses import dataclass
-from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import Deque, Dict, Iterable, List, Optional, Set, Union

from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
from vllm.logger import init_logger
from vllm.multimodal import MultiModalDataDict
from vllm.sampling_params import SamplingParams
from vllm.v1.core.kv_cache_manager import KVCacheManager
+from vllm.v1.engine import EngineCoreOutput
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus

@@ -227,13 +228,12 @@ def update_from_output(
        self,
        scheduler_output: "SchedulerOutput",
        model_runner_output: "ModelRunnerOutput",
-   ) -> List[Tuple[Request, int]]:
+   ) -> List[EngineCoreOutput]:
Review comment (Collaborator, Author):
I'm not sure it makes sense for this method to be in scheduler.py.

The only scheduler-related work here is updating self.running.

        # NOTE(woosuk): This method doesn't consider speculative decoding.
        sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist()
        num_scheduled_tokens = scheduler_output.num_scheduled_tokens
        new_running: List[Request] = []
-        # (request, num_sampled_tokens)
-        sampled: List[Tuple[Request, int]] = []
+        engine_core_outputs: List[EngineCoreOutput] = []
        for request in self.running:
            req_id = request.request_id
            request.num_computed_tokens += num_scheduled_tokens[req_id]
@@ -247,17 +247,30 @@ def update_from_output(
            # generates at most one token at each step.
            token_id = sampled_token_ids[req_index]
            request.output_token_ids.append(token_id)
-            sampled.append((request, 1))
+            num_new_tokens = 1

            # TODO: Update the KV cache manager for prefix caching.

-            # Check if the request is finished.
+            # Check for stop and update request state.
+            # This must be called before we make the EngineCoreOutput.
            stopped = self._check_stop(request)

+            # Add EngineCoreOutput for this Request.
+            output = EngineCoreOutput(
+                request_id=req_id,
+                new_token_ids=request.output_token_ids[-num_new_tokens:],
+                finished=request.is_finished(),
+                finish_reason=request.get_finished_reason(),
+                stop_reason=request.stop_reason)
+            engine_core_outputs.append(output)
+
+            # Break out of the loop.
            if stopped:
                continue

            new_running.append(request)
        self.running = new_running
-        return sampled
+        return engine_core_outputs

    def _check_stop(self, request: Request) -> bool:
        if (request.num_tokens >= self.max_model_len
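To illustrate the review comment above, here is a purely hypothetical sketch of how the EngineCoreOutput construction could live outside the scheduler, leaving update_from_output to manage only self.running. The helper name make_engine_core_outputs is illustrative and not part of the PR:

# Hypothetical helper (not in the PR): build EngineCoreOutputs from the
# (request, num_sampled_tokens) pairs the scheduler used to return, so the
# scheduler itself only updates self.running.
from typing import List, Tuple

from vllm.v1.engine import EngineCoreOutput
from vllm.v1.request import Request


def make_engine_core_outputs(
        sampled: List[Tuple[Request, int]]) -> List[EngineCoreOutput]:
    outputs: List[EngineCoreOutput] = []
    for request, num_new_tokens in sampled:
        outputs.append(
            EngineCoreOutput(
                request_id=request.request_id,
                new_token_ids=request.output_token_ids[-num_new_tokens:],
                finished=request.is_finished(),
                finish_reason=request.get_finished_reason(),
                stop_reason=request.stop_reason))
    return outputs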
62 changes: 62 additions & 0 deletions vllm/v1/engine/__init__.py
@@ -0,0 +1,62 @@
import asyncio
from dataclasses import dataclass
from typing import List, Optional, Union

import msgspec

from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import RequestOutputKind, SamplingParams

LLM_ENGINE_CORE_READY_STR = "READY"
POLLING_TIMEOUT_MS = 5000


@dataclass
class DetokenizerRequest:

    request_id: str
    prompt: Optional[str]
    prompt_token_ids: List[int]
    skip_special_tokens: bool
    spaces_between_special_tokens: bool
    output_kind: RequestOutputKind

    # Queue for streaming outputs to clients.
    output_queue: Optional[asyncio.Queue[RequestOutput]] = None


class EngineCoreRequest(msgspec.Struct):

    # NOTE: prompt and prompt_token_ids should be DecoderOnlyInput,
    # but this object is currently not playing well with msgspec
    # due to circular imports and typing we have in data.py

    request_id: str
    # NOTE(Nick): I don't think we need to pass prompt here since it should
    # always be tokenized?
    # prompt: Optional[str]
    prompt_token_ids: List[int]
    sampling_params: SamplingParams
    eos_token_id: Optional[int]
    arrival_time: float
    lora_request: Optional[LoRARequest]


@dataclass
class EngineCoreOutput:

    request_id: str
    new_token_ids: List[int]
    finished: bool
    finish_reason: Optional[str] = None
    stop_reason: Union[int, str, None] = None


class EngineCoreOutputs(msgspec.Struct):

    # NOTE(Nick): We could consider ways to make this more compact,
    # e.g. columnwise layout and using an int enum for finish/stop reason

    # [num_reqs]
    outputs: List[EngineCoreOutput]
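Since EngineCoreOutputs is a msgspec.Struct (and msgspec also handles the nested EngineCoreOutput dataclass), these messages can be round-tripped through msgspec's msgpack codec. A minimal sketch, assuming only the definitions above; the transport itself (e.g. a ZMQ socket) is elided:

# Minimal sketch: encode EngineCoreOutputs to bytes on the engine-core side
# and decode them back into the typed struct on the receiving side.
import msgspec

from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs

outputs = EngineCoreOutputs(outputs=[
    EngineCoreOutput(request_id="req-0", new_token_ids=[42], finished=False),
])

payload: bytes = msgspec.msgpack.encode(outputs)

decoded = msgspec.msgpack.decode(payload, type=EngineCoreOutputs)
assert decoded.outputs[0].new_token_ids == [42]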