Skip to content

Commit 9038647

Browse files
committed
Minor adjustment of capturing size
1 parent b5f6627 commit 9038647

File tree

1 file changed

+9
-2
lines changed

1 file changed

+9
-2
lines changed

vllm/v1/worker/gpu_model_runner.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2238,9 +2238,16 @@ def capture_model(self) -> None:
22382238
# to capture the attention for the mix prefill-decode (general) phase,
22392239
# based on the attention backends.
22402240
capture_attn_cudagraph_general = "auto" if full_cg else False
2241-
2241+
2242+
# Skip capturing batch size 1 in mix prefill-decode if
2243+
# separate_attention_routine is on, since bs=1 can be treated as a
2244+
# pure decode.
2245+
start_idx = 1 if self.vllm_config.compilation_config.separate_attention_routine \
2246+
and len(self.cudagraph_batch_sizes) > 0 and self.cudagraph_batch_sizes[0] == 1 \
2247+
else 0
2248+
22422249
# Capture the mix prefill-decode (general usage) cudagraphs
2243-
for num_tokens in tqdm(reversed(self.cudagraph_batch_sizes),
2250+
for num_tokens in tqdm(reversed(self.cudagraph_batch_sizes[start_idx:]),
22442251
desc="Capturing CUDA graphs (mix prefill-decode)",
22452252
total=len(self.cudagraph_batch_sizes)):
22462253
for _ in range(

0 commit comments

Comments
 (0)