Commit 4c11c9a: remove some output

blinkbear committed May 26, 2024
1 parent fed43a3
Showing 3 changed files with 15 additions and 4 deletions.
vllm/engine/llm_engine.py (1 addition, 1 deletion)
@@ -101,7 +101,7 @@ def __init__(
         log_stats: bool,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
     ) -> None:
-        logger.info(
+        logger.debug(
             "Initializing an LLM engine (v%s) with config: "
             "model=%r, speculative_config=%r, tokenizer=%r, "
             "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, "
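This hunk only lowers the engine-config banner from INFO to DEBUG; the message text itself is unchanged. If that banner is still wanted after the change, one option is to raise the level of the parent "vllm" logger through the standard library (a minimal sketch, not part of this commit, and assuming the configured handlers also pass DEBUG records):

    import logging

    # Allow DEBUG records from all "vllm.*" loggers to reach their handlers.
    logging.getLogger("vllm").setLevel(logging.DEBUG)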
vllm/model_executor/layers/sampler.py (11 additions, 0 deletions)
@@ -13,6 +13,7 @@
 from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
                            PromptLogprobs, SampleLogprobs, SamplerOutput,
                            SequenceOutput)
+import time

 # (num_token_ids, num_parent_ids) per sequence group.
 SampleResultType = List[Tuple[List[int], List[int]]]
@@ -160,6 +161,7 @@ def _apply_min_tokens_penalty(
     # list of indices in logits that will be set to -inf
     logits_to_penalize: List[Tuple[int, int]] = []
     logits_applied = 0
+    st = time.time()
     for seq_group in sampling_metadata.seq_groups:
         seq_ids = seq_group.seq_ids
         sampling_params = seq_group.sampling_params
@@ -186,11 +188,16 @@ def _apply_min_tokens_penalty(
             # itertools.product pairs each seq index with every token id
             logits_to_penalize.extend(
                 itertools.product(seqs_to_penalize, token_ids_to_penalize))
+    et = time.time()
+    print(f"traverse logits time = {et - st}")

+    st = time.time()
     if logits_to_penalize:
         # use zip and * to group indices along each dimension
         # eg. [ (1,2), (1,3), (5,6) ] -> ( (1,1,5), (2,3,6) )
         logits[tuple(zip(*logits_to_penalize))] = -float("inf")
+    et = time.time()
+    print(f"apply to logits applied time = {et - st}")

     # verifies that no rows in logits were missed unexpectedly
     assert logits_applied == logits.shape[0]
@@ -1009,6 +1016,8 @@ def _build_sampler_output(
     """

     sampler_output = []
+    import time
+    st = time.time()
     for (seq_group, sample_result, group_prompt_logprobs,
          group_sample_logprobs) in zip(sampling_metadata.seq_groups,
                                        sample_results, prompt_logprobs,
@@ ... @@
         sampler_output.append(
             CompletionSequenceGroupOutput(seq_outputs, group_prompt_logprobs))

+    et = time.time()
+    print(f"build_sampler_output {et - st}")
     # If not specified, store None values in SamplerOutput.
     if on_device_tensors is not None:
         (sampled_token_probs, logprobs_tensor,
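The instrumentation above repeats one pattern three times: take time.time() before a region, take it again after, and print the difference. For this kind of ad hoc measurement, a small context manager built on the monotonic time.perf_counter() clock keeps the call sites shorter (a minimal sketch of the same idea, not part of this commit; the helper name and labels are illustrative). Note also that when logits lives on the GPU, wall-clock timing around tensor operations mostly measures kernel launch time unless the device is synchronized first.

    import time
    from contextlib import contextmanager

    @contextmanager
    def timed(label: str):
        # perf_counter() is monotonic and high-resolution, so it suits short regions.
        start = time.perf_counter()
        try:
            yield
        finally:
            print(f"{label} = {time.perf_counter() - start:.6f}s")

    # Usage, mirroring the prints added in this commit:
    # with timed("traverse logits time"):
    #     ...  # loop over sampling_metadata.seq_groups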
vllm/worker/model_runner.py (3 additions, 3 deletions)
@@ -829,11 +829,11 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None:
         per sequence in the batch.
         """
         assert not self.model_config.enforce_eager
-        logger.info("Capturing the model for CUDA graphs. This may lead to "
+        logger.debug("Capturing the model for CUDA graphs. This may lead to "
                     "unexpected consequences if the model is not static. To "
                     "run the model in eager mode, set 'enforce_eager=True' or "
                     "use '--enforce-eager' in the CLI.")
-        logger.info("CUDA graphs can take additional 1~3 GiB memory per GPU. "
+        logger.debug("CUDA graphs can take additional 1~3 GiB memory per GPU. "
                     "If you are running out of memory, consider decreasing "
                     "`gpu_memory_utilization` or enforcing eager mode. "
                     "You can also reduce the `max_num_seqs` as needed "
@@ -899,7 +899,7 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None:
         end_time = time.perf_counter()
         elapsed_time = end_time - start_time
         # This usually takes < 10 seconds.
-        logger.info("Graph capturing finished in %.0f secs.", elapsed_time)
+        logger.debug("Graph capturing finished in %.0f secs.", elapsed_time)

     @property
     def vocab_size(self) -> int:
