Commit 4c11c9a: remove some output

blinkbear committed May 26, 2024
1 parent fed43a3
Showing 3 changed files with 15 additions and 4 deletions.
vllm/engine/llm_engine.py (1 addition, 1 deletion)
@@ -101,7 +101,7 @@ def __init__(
         log_stats: bool,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
     ) -> None:
-        logger.info(
+        logger.debug(
             "Initializing an LLM engine (v%s) with config: "
             "model=%r, speculative_config=%r, tokenizer=%r, "
             "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, "
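This hunk only lowers the engine-config banner from INFO to DEBUG; the message text itself is unchanged. If that banner is still wanted after the change, one option is to raise the level of the parent "vllm" logger through the standard library (a minimal sketch, not part of this commit, and assuming the configured handlers also pass DEBUG records):

    import logging

    # Allow DEBUG records from all "vllm.*" loggers to reach their handlers.
    logging.getLogger("vllm").setLevel(logging.DEBUG)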
vllm/model_executor/layers/sampler.py (11 additions, 0 deletions)
@@ -13,6 +13,7 @@
 from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
                            PromptLogprobs, SampleLogprobs, SamplerOutput,
                            SequenceOutput)
+import time

 # (num_token_ids, num_parent_ids) per sequence group.
 SampleResultType = List[Tuple[List[int], List[int]]]
@@ -160,6 +161,7 @@ def _apply_min_tokens_penalty(
     # list of indices in logits that will be set to -inf
     logits_to_penalize: List[Tuple[int, int]] = []
     logits_applied = 0
+    st = time.time()
     for seq_group in sampling_metadata.seq_groups:
         seq_ids = seq_group.seq_ids
         sampling_params = seq_group.sampling_params
@@ -186,11 +188,16 @@ def _apply_min_tokens_penalty(
             # itertools.product pairs each seq index with every token id
             logits_to_penalize.extend(
                 itertools.product(seqs_to_penalize, token_ids_to_penalize))
+    et = time.time()
+    print(f"traverse logits time = {et - st}")

+    st = time.time()
     if logits_to_penalize:
         # use zip and * to group indices along each dimension
         # eg. [ (1,2), (1,3), (5,6) ] -> ( (1,1,5), (2,3,6) )
         logits[tuple(zip(*logits_to_penalize))] = -float("inf")
+    et = time.time()
+    print(f"apply to logits applied time = {et - st}")

     # verifies that no rows in logits were missed unexpectedly
     assert logits_applied == logits.shape[0]
@@ -1009,6 +1016,8 @@ def _build_sampler_output(
     """

     sampler_output = []
+    import time
+    st = time.time()
     for (seq_group, sample_result, group_prompt_logprobs,
          group_sample_logprobs) in zip(sampling_metadata.seq_groups,
                                        sample_results, prompt_logprobs,
@@ ... @@
         sampler_output.append(
             CompletionSequenceGroupOutput(seq_outputs, group_prompt_logprobs))

+    et = time.time()
+    print(f"build_sampler_output {et - st}")
     # If not specified, store None values in SamplerOutput.
     if on_device_tensors is not None:
         (sampled_token_probs, logprobs_tensor,
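The instrumentation above repeats one pattern three times: take time.time() before a region, take it again after, and print the difference. For this kind of ad hoc measurement, a small context manager built on the monotonic time.perf_counter() clock keeps the call sites shorter (a minimal sketch of the same idea, not part of this commit; the helper name and labels are illustrative). Note also that when logits lives on the GPU, wall-clock timing around tensor operations mostly measures kernel launch time unless the device is synchronized first.

    import time
    from contextlib import contextmanager

    @contextmanager
    def timed(label: str):
        # perf_counter() is monotonic and high-resolution, so it suits short regions.
        start = time.perf_counter()
        try:
            yield
        finally:
            print(f"{label} = {time.perf_counter() - start:.6f}s")

    # Usage, mirroring the prints added in this commit:
    # with timed("traverse logits time"):
    #     ...  # loop over sampling_metadata.seq_groups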
vllm/worker/model_runner.py (3 additions, 3 deletions)
@@ -829,11 +829,11 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None:
         per sequence in the batch.
         """
         assert not self.model_config.enforce_eager
-        logger.info("Capturing the model for CUDA graphs. This may lead to "
+        logger.debug("Capturing the model for CUDA graphs. This may lead to "
                     "unexpected consequences if the model is not static. To "
                     "run the model in eager mode, set 'enforce_eager=True' or "
                     "use '--enforce-eager' in the CLI.")
-        logger.info("CUDA graphs can take additional 1~3 GiB memory per GPU. "
+        logger.debug("CUDA graphs can take additional 1~3 GiB memory per GPU. "
                     "If you are running out of memory, consider decreasing "
                     "`gpu_memory_utilization` or enforcing eager mode. "
                     "You can also reduce the `max_num_seqs` as needed "
@@ -899,7 +899,7 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None:
         end_time = time.perf_counter()
         elapsed_time = end_time - start_time
         # This usually takes < 10 seconds.
-        logger.info("Graph capturing finished in %.0f secs.", elapsed_time)
+        logger.debug("Graph capturing finished in %.0f secs.", elapsed_time)

     @property
     def vocab_size(self) -> int:
