From 4c11c9ad495cc43f99d3e68acf91cb090f644555 Mon Sep 17 00:00:00 2001
From: blinkbear
Date: Sun, 26 May 2024 09:43:36 +0000
Subject: [PATCH] remove some output

---
 vllm/engine/llm_engine.py             |  2 +-
 vllm/model_executor/layers/sampler.py | 11 +++++++++++
 vllm/worker/model_runner.py           |  6 +++---
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 692dd3a906304..56e107306f43f 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -101,7 +101,7 @@ def __init__(
         log_stats: bool,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
     ) -> None:
-        logger.info(
+        logger.debug(
             "Initializing an LLM engine (v%s) with config: "
             "model=%r, speculative_config=%r, tokenizer=%r, "
             "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, "
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py
index a84f562909d50..849fee6a11483 100644
--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -13,6 +13,7 @@
 from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
                            PromptLogprobs, SampleLogprobs, SamplerOutput,
                            SequenceOutput)
+import time
 
 # (num_token_ids, num_parent_ids) per sequence group.
 SampleResultType = List[Tuple[List[int], List[int]]]
@@ -160,6 +161,7 @@ def _apply_min_tokens_penalty(
     # list of indices in logits that will be set to -inf
     logits_to_penalize: List[Tuple[int, int]] = []
     logits_applied = 0
+    st = time.time()
     for seq_group in sampling_metadata.seq_groups:
         seq_ids = seq_group.seq_ids
         sampling_params = seq_group.sampling_params
@@ -186,11 +188,16 @@
                 # itertools.product pairs each seq index with every token id
                 logits_to_penalize.extend(
                     itertools.product(seqs_to_penalize, token_ids_to_penalize))
+    et = time.time()
+    print(f"traverse seq_groups time = {et - st}")
 
+    st = time.time()
     if logits_to_penalize:
         # use zip and * to group indices along each dimension
         # eg. [ (1,2), (1,3), (5,6) ] -> ( (1,1,5), (2,3,6) )
         logits[tuple(zip(*logits_to_penalize))] = -float("inf")
+    et = time.time()
+    print(f"apply penalty to logits time = {et - st}")
 
     # verifies that no rows in logits were missed unexpectedly
     assert logits_applied == logits.shape[0]
@@ -1009,6 +1016,8 @@ def _build_sampler_output(
     """
     sampler_output = []
+    import time
+    st = time.time()
     for (seq_group, sample_result, group_prompt_logprobs,
          group_sample_logprobs) in zip(sampling_metadata.seq_groups,
                                        sample_results, prompt_logprobs,
@@ -1024,6 +1033,8 @@
         sampler_output.append(
             CompletionSequenceGroupOutput(seq_outputs, group_prompt_logprobs))
+    et = time.time()
+    print(f"build_sampler_output time = {et - st}")
 
     # If not specified, store None values in SamplerOutput.
     if on_device_tensors is not None:
         (sampled_token_probs, logprobs_tensor,
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 87d5f5c1b9d67..7143bacbe40fe 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -829,11 +829,11 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None:
         per sequence in the batch.
         """
         assert not self.model_config.enforce_eager
-        logger.info("Capturing the model for CUDA graphs. This may lead to "
+        logger.debug("Capturing the model for CUDA graphs. This may lead to "
                     "unexpected consequences if the model is not static. To "
                     "run the model in eager mode, set 'enforce_eager=True' or "
                     "use '--enforce-eager' in the CLI.")
-        logger.info("CUDA graphs can take additional 1~3 GiB memory per GPU. "
+        logger.debug("CUDA graphs can take additional 1~3 GiB memory per GPU. "
                     "If you are running out of memory, consider decreasing "
                     "`gpu_memory_utilization` or enforcing eager mode. "
                     "You can also reduce the `max_num_seqs` as needed "
@@ -899,7 +899,7 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None:
         end_time = time.perf_counter()
         elapsed_time = end_time - start_time
         # This usually takes < 10 seconds.
-        logger.info("Graph capturing finished in %.0f secs.", elapsed_time)
+        logger.debug("Graph capturing finished in %.0f secs.", elapsed_time)
 
     @property
     def vocab_size(self) -> int:
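
Note on the idiom the first timer in sampler.py wraps: logits[tuple(zip(*logits_to_penalize))] transposes a list of (row, column) pairs into one index sequence per dimension, which PyTorch advanced indexing consumes directly. A minimal standalone sketch of that step (toy tensor shape and index values invented for illustration; not part of the patch):

    import itertools

    import torch

    seqs_to_penalize = [0, 2]        # row indices (sequences in the batch)
    token_ids_to_penalize = [5, 7]   # column indices (stop-token ids)

    # itertools.product pairs each row with every column:
    # [(0, 5), (0, 7), (2, 5), (2, 7)]
    pairs = list(itertools.product(seqs_to_penalize, token_ids_to_penalize))

    logits = torch.zeros(3, 10)
    # zip(*pairs) transposes the pairs to ((0, 0, 2, 2), (5, 7, 5, 7)),
    # i.e. (row_indices, col_indices), so one assignment masks all four cells
    logits[tuple(zip(*pairs))] = -float("inf")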
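On the timers themselves: the prints added to sampler.py use time.time(), while capture_model (see the context lines above) already times with time.perf_counter(), which is monotonic and better suited to short intervals. A small sketch of that pattern (the function name and workload are illustrative only):

    import time

    def timed_section() -> int:
        st = time.perf_counter()                       # monotonic clock
        total = sum(i * i for i in range(1_000_000))   # stand-in workload
        et = time.perf_counter()
        print(f"timed_section took {et - st:.6f} s")
        return total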