File tree Expand file tree Collapse file tree 1 file changed +9
-2
lines changed Expand file tree Collapse file tree 1 file changed +9
-2
lines changed Original file line number Diff line number Diff line change @@ -2238,9 +2238,16 @@ def capture_model(self) -> None:
2238
2238
# to capture the attention for the mix prefill-decode (general) phase,
2239
2239
# based on the attention backends.
2240
2240
capture_attn_cudagraph_general = "auto" if full_cg else False
2241
-
2241
+
2242
+ # Skip capturing batch sizes of 1 in mix prefill-decode if
2243
+ # separate_attention_routine is on, since bs=1 can be treated as a
2244
+ # pure decode.
2245
+ start_idx = 1 if self .vllm_config .compilation_config .separate_attention_routine \
2246
+ and len (self .cudagraph_batch_sizes ) > 0 and self .cudagraph_batch_sizes [0 ] == 1 \
2247
+ else 0
2248
+
2242
2249
# Capture the mix prefill-decode (general usage) cudagraphs
2243
- for num_tokens in tqdm (reversed (self .cudagraph_batch_sizes ),
2250
+ for num_tokens in tqdm (reversed (self .cudagraph_batch_sizes [ start_idx :] ),
2244
2251
desc = "Capturing CUDA graphs (mix prefill-decode)" ,
2245
2252
total = len (self .cudagraph_batch_sizes )):
2246
2253
for _ in range (
You can’t perform that action at this time.
0 commit comments