
[V1] Only print cudagraph tqdm on rank 0 with is_global_first_rank #19516


Open

wants to merge 2 commits into main
31 changes: 31 additions & 0 deletions vllm/distributed/parallel_state.py
@@ -1315,6 +1315,37 @@ def in_the_same_node_as(pg: Union[ProcessGroup, StatelessProcessGroup],
    return [x == 1 for x in aggregated_data.tolist()]


def is_global_first_rank() -> bool:
    """
    Check if the current process is the first rank globally across all
    parallelism strategies (PP, TP, DP, EP, etc.).

    Unlike group-specific checks like `get_tensor_model_parallel_rank() == 0`
    or `get_pp_group().is_first_rank`, this function checks the global rank
    across all parallelism dimensions.

    Returns:
        bool: True if this is the global first rank (rank 0), False otherwise.
        Returns True if distributed is not initialized (single process).
    """
    try:
        # If world group is available, use it for the most accurate check
        global _WORLD
        if _WORLD is not None:
            return _WORLD.is_first_rank

        # If torch distributed is not initialized, assume single process
        if not torch.distributed.is_initialized():
            return True

        # Fallback to torch's global rank
        return torch.distributed.get_rank() == 0

    except Exception:
        # If anything goes wrong, assume this is the first rank
        return True
Comment on lines +1344 to +1346
Contributor

medium

The use of a bare `except Exception:` can mask various underlying issues from `torch.distributed` or the `_WORLD` state. This might lead to unexpected behavior if, for example, multiple processes erroneously assume they are the global first rank due to an unrelated exception during rank determination.

To improve robustness and diagnosability, consider the following:

  1. Capture the exception instance: Change `except Exception:` to `except Exception as e:`. This allows access to the specific error.
  2. Log the exception: Logging the error `e` would be beneficial for debugging. This helps in understanding why rank determination might have failed. (Assuming a logger is available or can be set up in this module, similar to other `vllm.distributed` files.)
  3. Docstring update: The current docstring explains that the function returns `True` if distributed is not initialized. It would be helpful to also document the behavior of returning `True` in case of any other exception during rank checking, if this is the intended safe default for all scenarios.

If returning `True` on any error is a deliberate and safe design choice for all potential uses of this function, explicitly stating this rationale in a comment within the `except` block or the docstring would improve clarity.

Suggested change

-    except Exception:
-        # If anything goes wrong, assume this is the first rank
-        return True
+    except Exception as e:  # Capture the specific exception instance.
+        # Consider logging 'e' for debugging to understand potential failures.
+        # For example, if a logger is configured:
+        # logger.warning("is_global_first_rank() encountered an error, defaulting to True: %s", e, exc_info=True)
+        # If anything goes wrong, assume this is the first rank
+        return True

Member

logging a debug message at least would be nice here
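
For illustration, a minimal sketch (not part of this PR) of what that debug log could look like, assuming the module sets up a logger with vLLM's `init_logger` helper and keeps the same fall-back-to-True behavior; only the plain `torch.distributed` path is shown:

import torch

from vllm.logger import init_logger

logger = init_logger(__name__)


def is_global_first_rank() -> bool:
    # Simplified variant of the helper, shown only to demonstrate the logging.
    try:
        if not torch.distributed.is_initialized():
            return True
        return torch.distributed.get_rank() == 0
    except Exception as e:
        # Log at debug level so failures in rank determination stay visible
        # without spamming every rank's console, then keep the safe default.
        logger.debug("is_global_first_rank() failed, defaulting to True: %s", e)
        return True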



def _node_count(pg: Union[ProcessGroup, StatelessProcessGroup]) -> int:
    """
    Returns the total number of nodes in the process group.
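
Before the runner change below, a brief sketch of the intended call pattern for the new helper; the wrapper function and its arguments here are hypothetical, not taken from this diff:

from tqdm import tqdm

from vllm.distributed.parallel_state import is_global_first_rank


def warm_up(batch_sizes, run_one_batch):
    # Wrap the iterable in tqdm only on the global first rank so that a
    # multi-process run prints a single progress bar instead of one per rank.
    cases = list(batch_sizes)
    if is_global_first_rank():
        cases = tqdm(cases, desc="Warming up batch sizes")
    for batch_size in cases:
        run_one_batch(batch_size)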
11 changes: 7 additions & 4 deletions vllm/v1/worker/gpu_model_runner.py
@@ -25,7 +25,7 @@
                                          has_kv_transfer_group)
from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
from vllm.distributed.parallel_state import (
-    get_pp_group, get_tp_group, graph_capture,
+    get_pp_group, get_tp_group, graph_capture, is_global_first_rank,
    prepare_communication_buffer_for_model)
from vllm.forward_context import (DPMetadata, get_forward_context,
                                  set_forward_context)
@@ -2207,9 +2207,12 @@ def capture_model(self) -> None:
        # can reuse the memory pool allocated for the large shapes.
        with graph_capture(device=self.device):
            full_cg = self.full_cuda_graph
-            for num_tokens in tqdm(reversed(self.cudagraph_batch_sizes),
-                                   desc="Capturing CUDA graphs",
-                                   total=len(self.cudagraph_batch_sizes)):
+            # Only rank 0 should print progress bar during capture
+            compilation_cases = reversed(self.cudagraph_batch_sizes)
+            if is_global_first_rank():
+                compilation_cases = tqdm(list(compilation_cases),
+                                         desc="Capturing CUDA graph shapes")
+            for num_tokens in compilation_cases:
                for _ in range(
                        self.compilation_config.cudagraph_num_of_warmups):
                    self._dummy_run(num_tokens, capture_attn_cudagraph=full_cg)