38 changes: 38 additions & 0 deletions run_stats.py
@@ -0,0 +1,38 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# hf download Qwen/Qwen3-0.6B --local-dir /data/local/models/qwen3_06b
# rm /data/local/models/qwen3_06b/tokenizer*

from vllm import LLM, SamplingParams, TokensPrompt
from vllm.sampling_params import RequestOutputKind

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    # "The capital of France is",
    # "The future of AI is",
]
sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=500,
    stop="is",
    n=2,
    output_kind=RequestOutputKind.CUMULATIVE,
    # stop_token_ids=[5],
)
tokens_prompt = TokensPrompt(prompt_token_ids=[2, 3, 4])

if __name__ == "__main__":
    llm = LLM(
        model="Qwen/Qwen3-0.6B",
        enforce_eager=True,
        # skip_tokenizer_init=True,
        gpu_memory_utilization=0.8,
        disable_log_stats=False,
    )
    outputs = llm.generate(prompts=prompts, sampling_params=sampling_params)
    for output in outputs:
        prompt = output.prompt_token_ids
        generated_token_ids = output.outputs[0].token_ids
        print(f"Prompt: {prompt!r}, Generated tokens: {generated_token_ids!r}")
13 changes: 6 additions & 7 deletions vllm/entrypoints/llm.py
@@ -46,8 +46,7 @@
ScoringRequestOutput)
from vllm.plugins.io_processors import get_io_processor
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import (BeamSearchParams, RequestOutputKind,
                                  SamplingParams)
from vllm.sampling_params import BeamSearchParams, SamplingParams
from vllm.tasks import PoolingTask
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
get_cached_tokenizer,
@@ -90,7 +89,7 @@ class LLM:
or videos from directories specified by the server file system.
This is a security risk. Should only be enabled in trusted
environments.
allowed_media_domains: If set, only media URLs that belong to this
allowed_media_domains: If set, only media URLs that belong to this
domain can be used for multi-modal inputs.
tensor_parallel_size: The number of GPUs to use for distributed
execution with tensor parallelism.
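For context on the docstring entries above, a minimal constructor sketch using these parameters; the values are illustrative assumptions, not part of the diff, and allowed_media_domains is shown as a list of domain strings based on the description.

# Hypothetical example only: parameter values are placeholders.
llm = LLM(
    model="Qwen/Qwen3-0.6B",
    allowed_media_domains=["example.com"],  # restrict multi-modal media URLs
    tensor_parallel_size=2,                 # shard the model across 2 GPUs
)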
@@ -1504,10 +1503,10 @@ def _validate_and_add_requests(
            raise ValueError("The lengths of prompts and lora_request "
                             "must be the same.")

        for sp in params if isinstance(params, Sequence) else (params, ):
            if isinstance(sp, SamplingParams):
                # We only care about the final output
                sp.output_kind = RequestOutputKind.FINAL_ONLY
        # for sp in params if isinstance(params, Sequence) else (params, ):
        #     if isinstance(sp, SamplingParams):
        #         # We only care about the final output
        #         sp.output_kind = RequestOutputKind.FINAL_ONLY

        # Add requests to the engine.
        it = prompts
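The commented-out block above is what previously forced every SamplingParams to FINAL_ONLY inside _validate_and_add_requests; with it disabled, an output_kind chosen by the caller (for example CUMULATIVE in run_stats.py) reaches the output processor unchanged. A minimal caller-side sketch of the two modes, using only the public SamplingParams API:

from vllm.sampling_params import RequestOutputKind, SamplingParams

# Previously hard-coded behaviour: one RequestOutput per request,
# produced only when the request finishes.
final_params = SamplingParams(max_tokens=32,
                              output_kind=RequestOutputKind.FINAL_ONLY)

# What run_stats.py requests: outputs that accumulate all tokens
# generated so far as the request progresses.
cumulative_params = SamplingParams(max_tokens=32,
                                   output_kind=RequestOutputKind.CUMULATIVE)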
4 changes: 4 additions & 0 deletions vllm/v1/engine/llm_engine.py
@@ -274,6 +274,7 @@

        # 1) Get EngineCoreOutput from the EngineCore.
        outputs = self.engine_core.get_output()
        logger.info(f"{len(outputs.outputs)=}")

Check failure (GitHub Actions / pre-commit): Ruff (G004) vllm/v1/engine/llm_engine.py:277:21: Logging statement uses f-string
        # 2) Process EngineCoreOutputs.
        iteration_stats = IterationStats() if self.log_stats else None
@@ -294,6 +295,9 @@
            )
            self.do_log_stats_with_interval()

        logger.info(
            f"EngineCore: {outputs.scheduler_stats=}, {iteration_stats=}, {processed_outputs.request_outputs=}"
        )

Check failure (GitHub Actions / pre-commit): Ruff (E501) vllm/v1/engine/llm_engine.py:299:81: Line too long (111 > 80)
Check failure (GitHub Actions / pre-commit): Ruff (G004) vllm/v1/engine/llm_engine.py:299:13: Logging statement uses f-string

        return processed_outputs.request_outputs

    def get_vllm_config(self):
9 changes: 9 additions & 0 deletions vllm/v1/engine/output_processor.py
@@ -8,6 +8,7 @@

import torch

from vllm.logger import init_logger
from vllm.outputs import (CompletionOutput, PoolingOutput,
                          PoolingRequestOutput, RequestOutput)
from vllm.sampling_params import RequestOutputKind
@@ -22,6 +23,8 @@
from vllm.v1.metrics.stats import (IterationStats, LoRARequestStates,
                                   RequestStateStats)

logger = init_logger(__name__)


class RequestOutputCollector:
"""
@@ -190,6 +193,9 @@
        kv_transfer_params: Optional[dict[str, Any]] = None,
    ) -> Optional[Union[RequestOutput, PoolingRequestOutput]]:

        logger.info(
            f"{self.request_id=}, {self.output_kind=}, {finish_reason=}, {stop_reason=}"
        )

Check failure (GitHub Actions / pre-commit): Ruff (E501) vllm/v1/engine/output_processor.py:197:81: Line too long (88 > 80)
Check failure (GitHub Actions / pre-commit): Ruff (G004) vllm/v1/engine/output_processor.py:197:13: Logging statement uses f-string

        finished = finish_reason is not None
        final_only = self.output_kind == RequestOutputKind.FINAL_ONLY

@@ -446,6 +452,9 @@
            if request_output := req_state.make_request_output(
                    new_token_ids, pooling_output, finish_reason, stop_reason,
                    kv_transfer_params):
                assert isinstance(request_output, RequestOutput)
                logger.info(
                    f"Request {req_id} metrics: {request_output.metrics}")

Check failure (GitHub Actions / pre-commit): Ruff (G004) vllm/v1/engine/output_processor.py:457:21: Logging statement uses f-string
                if req_state.queue is not None:
                    # AsyncLLM: put into queue for handling by generate().
                    req_state.queue.put(request_output)
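All of the pre-commit failures above are Ruff G004 (f-string passed to a logging call) and E501 (line too long). A minimal sketch of the conventional fix for the make_request_output log call, using lazy %-style arguments so the message is only formatted when the log level is enabled; the same pattern would apply to the other logging sites in this diff.

        # G004/E501-compliant variant of the added log call.
        logger.info(
            "request_id=%s, output_kind=%s, finish_reason=%s, stop_reason=%s",
            self.request_id, self.output_kind, finish_reason, stop_reason)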