[UX] Add Feedback During CUDAGraph Capture #19501

Merged · Jun 11, 2025 · 1 commit

5 changes: 4 additions & 1 deletion vllm/v1/worker/gpu_model_runner.py
@@ -12,6 +12,7 @@
 import torch
 import torch.distributed
 import torch.nn as nn
+from tqdm import tqdm

 import vllm.envs as envs
 from vllm.attention import AttentionType, get_attn_backend
@@ -2034,7 +2035,9 @@ def capture_model(self) -> None:
         # can reuse the memory pool allocated for the large shapes.
         with graph_capture(device=self.device):
             skip_attn = not self.vllm_config.compilation_config.full_cuda_graph
-            for num_tokens in reversed(self.cudagraph_batch_sizes):
+            for num_tokens in tqdm(reversed(self.cudagraph_batch_sizes),
Member:

Please make the tqdm run only on the first model runner; otherwise the TP/PP ranks will clobber each other's output as each one increments its own bar.

See how we did it in V0 (snippet below), although I think we should generalize this to any parallelism rather than just TP:

# Only rank 0 should print progress bar during capture
if get_tensor_model_parallel_rank() == 0:
    compilation_cases = tqdm(
        list(compilation_cases),
        desc="Capturing CUDA graph shapes")

Member:

sorry about this, michael

desc="Capturing CUDA graphs",
total=len(self.cudagraph_batch_sizes)):
Comment on lines +2038 to +2040
Contributor:

Severity: medium

The addition of tqdm for progress during CUDA graph capture is a good user experience improvement.

To provide more fine-grained control, especially in non-interactive environments or when users prefer quieter logs, consider making the progress bar display conditional. A common approach is to tie its visibility to the logging level. For example, you could disable tqdm if the effective log level is WARNING or higher.

This aligns with how tqdm is used elsewhere in vLLM (e.g., for model weight loading, which uses load_config.use_tqdm_on_load). While adding a new config flag to CompilationConfig (e.g., use_tqdm_on_capture) would be the most consistent, a simpler approach using the existing logger is also possible.

(Note: The suggestion below would also require import logging at the top of the file.)

Suggested change
-for num_tokens in tqdm(reversed(self.cudagraph_batch_sizes),
-                       desc="Capturing CUDA graphs",
-                       total=len(self.cudagraph_batch_sizes)):
+# Determine if tqdm should be disabled based on log level
+# (add `import logging` at the top of the file if not already present)
+effective_log_level = logger.getEffectiveLevel()
+# Disable tqdm if logging level is WARNING or higher. Adjust as needed.
+disable_tqdm = effective_log_level >= logging.WARNING
+for num_tokens in tqdm(reversed(self.cudagraph_batch_sizes),
+                       desc="Capturing CUDA graphs",
+                       total=len(self.cudagraph_batch_sizes),
+                       disable=disable_tqdm):
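
For reference, a sketch of the more consistent alternative the comment mentions: a use_tqdm_on_capture flag mirroring load_config.use_tqdm_on_load. The field is an assumption for illustration and does not exist in vLLM's CompilationConfig:

from dataclasses import dataclass
from tqdm import tqdm

@dataclass
class CompilationConfig:
    # Hypothetical field, named by analogy with use_tqdm_on_load;
    # not part of the actual vLLM config.
    use_tqdm_on_capture: bool = True

config = CompilationConfig(use_tqdm_on_capture=False)  # e.g. quiet CI runs
for num_tokens in tqdm([512, 256, 128],  # stand-in for cudagraph_batch_sizes
                       desc="Capturing CUDA graphs",
                       disable=not config.use_tqdm_on_capture):
    pass  # per-shape capture work would run here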

                 for _ in range(self.vllm_config.compilation_config.
                                cudagraph_num_of_warmups):
                     self._dummy_run(num_tokens, skip_attn=skip_attn)