Rename num-of-microbatches to ubatch-size

jiangkuaixue123 · jiangkuaixue123 · commit 1599422f8f05 · 2025-12-08T21:00:34.000+08:00
Signed-off-by: jiangkuaixue123 <jiangxiaozhou111@163.com>
diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
@@ -93,7 +93,7 @@ def parse_args():
         help=("Enable microbatched execution"),
     )
     parser.add_argument(
-        "--num-of-microbatches",
+        "--ubatch-size",
         type=int,
         default=2,
         help=("Number of microbatches. Requires --enable-dbo to be enabled."),
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
@@ -156,8 +156,8 @@ class ParallelConfig:
 
     enable_dbo: bool = False
     """Enable dual batch overlap for the model executor."""
-    num_of_microbatches: int = 2
-    """Number of microbatches. Requires --enable-dbo to be enabled."""
+    ubatch_size: int = 0
+    """Number of ubatches; a value > 1 enables ubatching even without DBO."""
 
     dbo_decode_token_threshold: int = 32
     """The threshold for dual batch overlap for batches only containing decodes.
@@ -330,6 +330,14 @@ def world_size_across_dp(self) -> int:
         """world_size_across_dp is TPxPPxDP, it is the size of the world
         including data parallelism."""
         return self.world_size * self.data_parallel_size
+    
+    @property
+    def use_ubatching(self) -> bool:
+        return self.enable_dbo or self.ubatch_size > 1
+    
+    @property
+    def num_of_ubatches(self) -> int:
+        return 2 if self.enable_dbo else self.ubatch_size
 
     def get_next_dp_init_port(self) -> int:
         """
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
@@ -851,7 +851,7 @@ def has_blocked_weights():
                     f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
                 )
 
-        if self.parallel_config.enable_dbo:
+        if self.parallel_config.use_ubatching:
             a2a_backend = self.parallel_config.all2all_backend
             assert a2a_backend in [
                 "deepep_low_latency",
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
@@ -409,7 +409,7 @@ class EngineArgs:
     enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
     all2all_backend: str | None = ParallelConfig.all2all_backend
     enable_dbo: bool = ParallelConfig.enable_dbo
-    num_of_microbatches: int = ParallelConfig.num_of_microbatches
+    ubatch_size: int = ParallelConfig.ubatch_size
     dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold
     dbo_prefill_token_threshold: int = ParallelConfig.dbo_prefill_token_threshold
     disable_nccl_for_dp_synchronization: bool = (
@@ -830,8 +830,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         )
         parallel_group.add_argument("--enable-dbo", **parallel_kwargs["enable_dbo"])
         parallel_group.add_argument(
-            "--num-of-microbatches",
-            **parallel_kwargs["num_of_microbatches"],
+            "--ubatch-size",
+            **parallel_kwargs["ubatch_size"],
         )
         parallel_group.add_argument(
             "--dbo-decode-token-threshold",
@@ -1607,7 +1607,7 @@ def create_engine_config(
             enable_expert_parallel=self.enable_expert_parallel,
             all2all_backend=self.all2all_backend,
             enable_dbo=self.enable_dbo,
-            num_of_microbatches=self.num_of_microbatches,
+            ubatch_size=self.ubatch_size,
             dbo_decode_token_threshold=self.dbo_decode_token_threshold,
             dbo_prefill_token_threshold=self.dbo_prefill_token_threshold,
             disable_nccl_for_dp_synchronization=self.disable_nccl_for_dp_synchronization,
diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
@@ -23,7 +23,7 @@
     dbo_yield_and_switch_from_comm_to_compute,
     dbo_yield_and_switch_from_compute_to_comm,
 )
-
+from typing import Any
 
 class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
     """
@@ -63,7 +63,7 @@ def __init__(
         # The dispatch function returns a handle that the combine function
         # requires. Under DBO microbatching we must track one handle per
         # micro-batch to avoid races between threads.
-        self.handles = []
+        self.handles: list[Any | None] = []
 
         # From https://github.com/deepseek-ai/DeepEP/blob/9fe9021f29c9083cd1808ab36b740208524d9f63/deep_ep/buffer.py#L164
         self.available_rank_configs = [2, 4, 8, 16, 24, 32, 64, 128, 144, 160]
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1502,9 +1502,10 @@ def ensure_dp_chunking_init(self):
 
         moe = self.moe_config
 
-        if self.vllm_config.parallel_config.enable_dbo:
-            states_shape = (2, moe.max_num_tokens, self.hidden_size)
-            logits_shape = (2, moe.max_num_tokens, self.logical_num_experts)
+        if self.vllm_config.parallel_config.use_ubatching:
+            num_of_ubatches = self.vllm_config.parallel_config.num_of_ubatches
+            states_shape = (num_of_ubatches, moe.max_num_tokens, self.hidden_size)
+            logits_shape = (num_of_ubatches, moe.max_num_tokens, self.logical_num_experts)
         else:
             states_shape = (moe.max_num_tokens, self.hidden_size)
             logits_shape = (moe.max_num_tokens, self.logical_num_experts)
diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py
@@ -132,7 +132,7 @@ def _synchronize_dp_ranks(
     assert should_attempt_dp_padding == should_dp_pad
 
     # Check conditions for microbatching
-    should_ubatch = _post_process_ubatch(tensor, parallel_config.num_of_microbatches)
+    should_ubatch = _post_process_ubatch(tensor, parallel_config.num_of_ubatches)
 
     if should_ubatch and not should_dp_pad:
         logger.debug_once(
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
@@ -2910,7 +2910,7 @@ def execute_model(
 
                 cascade_attn_prefix_lens = None
                 # Disable cascade attention when using microbatching (DBO)
-                if self.cascade_attn_enabled and not self.parallel_config.enable_dbo:
+                if self.cascade_attn_enabled and not self.parallel_config.use_ubatching:
                     # Pre-compute cascade attention prefix lengths
                     cascade_attn_prefix_lens = self._compute_cascade_attn_prefix_lens(
                         num_scheduled_tokens_np,
@@ -2950,11 +2950,11 @@ def execute_model(
                     num_scheduled_tokens_np,
                     num_tokens_padded,
                     num_reqs_padded,
-                    self.parallel_config.num_of_microbatches,
+                    self.parallel_config.num_of_ubatches,
                 )
 
-                logger.info(
-                    "jcz ubatch_slices: %s, ubatch_slices_padded: %s",
+                logger.debug(
+                    "ubatch_slices: %s, ubatch_slices_padded: %s",
                     ubatch_slices,
                     ubatch_slices_padded,
                 )
@@ -3624,11 +3624,11 @@ def load_model(self, eep_scale_up: bool = False) -> None:
         # wrap the model with full cudagraph wrapper if needed.
         cudagraph_mode = self.compilation_config.cudagraph_mode
         assert cudagraph_mode is not None
-        if cudagraph_mode.has_full_cudagraphs() and not self.parallel_config.enable_dbo:
+        if cudagraph_mode.has_full_cudagraphs() and not self.parallel_config.use_ubatching:
             self.model = CUDAGraphWrapper(
                 self.model, self.vllm_config, runtime_mode=CUDAGraphMode.FULL
             )
-        elif self.parallel_config.enable_dbo:
+        elif self.parallel_config.use_ubatching:
             if cudagraph_mode.has_full_cudagraphs():
                 self.model = UBatchWrapper(
                     self.model, self.vllm_config, CUDAGraphMode.FULL, self.device
@@ -3999,10 +3999,10 @@ def _dummy_run(
             num_scheduled_tokens,
             num_tokens_padded,
             num_reqs_padded,
-            self.vllm_config.parallel_config.num_of_microbatches,
+            self.vllm_config.parallel_config.num_of_ubatches,
         )
-        logger.info(
-            "jcz ubatch_slices: %s, ubatch_slices_padded: %s",
+        logger.debug(
+            "ubatch_slices: %s, ubatch_slices_padded: %s",
             ubatch_slices,
             ubatch_slices_padded,
         )
@@ -4529,8 +4529,8 @@ def _capture_cudagraphs(
             # is above the threshold. Otherwise we just capture a non-ubatched
             # version of the graph
             allow_microbatching = (
-                self.parallel_config.enable_dbo
-                and self.parallel_config.num_of_microbatches > 1
+                self.parallel_config.use_ubatching
+                and self.parallel_config.num_of_ubatches > 1
                 and cudagraph_runtime_mode == CUDAGraphMode.FULL
                 and uniform_decode
                 and check_ubatch_thresholds(
@@ -4662,8 +4662,8 @@ def initialize_metadata_builders(
                     if kv_cache_group_id < len(kernel_block_sizes)
                     else None,
                     num_metadata_builders=1
-                    if not self.parallel_config.enable_dbo
-                    else self.parallel_config.num_of_microbatches,
+                    if not self.parallel_config.use_ubatching
+                    else self.parallel_config.num_of_ubatches,
                 )
         # Calculate reorder batch threshold (if needed)
         # Note (tdoublep): do this *after* constructing builders,
diff --git a/vllm/v1/worker/gpu_ubatch_wrapper.py b/vllm/v1/worker/gpu_ubatch_wrapper.py
@@ -105,7 +105,7 @@ def __init__(
         self.comm_stream = torch.cuda.Stream(device=device)
         # Ubatch threads plus the main thread
         self.ready_barrier = threading.Barrier(
-            self.vllm_config.parallel_config.num_of_microbatches + 1
+            self.vllm_config.parallel_config.num_of_ubatches + 1
         )
 
         self.cudagraphs: dict[int, CUDAGraphMetaData] = {}
diff --git a/vllm/v1/worker/ubatch_utils.py b/vllm/v1/worker/ubatch_utils.py
@@ -38,7 +38,7 @@ def is_last_ubatch_empty(
 def check_ubatch_thresholds(
     config: ParallelConfig, num_tokens: int, uniform_decode: bool
 ) -> bool:
-    if not config.enable_dbo:
+    if not config.use_ubatching:
         return False
     if uniform_decode:
         return num_tokens >= config.dbo_decode_token_threshold

Original file line number	Diff line number	Diff line change
`@@ -93,7 +93,7 @@ def parse_args():`
`93`	`93`	`help=("Enable microbatched execution"),`
`94`	`94`	`)`
`95`	`95`	`parser.add_argument(`
`96`		`- "--num-of-microbatches",`
	`96`	`+ "--ubatch-size",`
`97`	`97`	`type=int,`
`98`	`98`	`default=2,`
`99`	`99`	`help=("Number of microbatches. Requires --enable-dbo to be enabled."),`
Original file line number	Diff line number	Diff line change
`@@ -851,7 +851,7 @@ def has_blocked_weights():`
`851`	`851`	`f"cudagraph_mode={self.compilation_config.cudagraph_mode}"`
`852`	`852`	`)`
`853`	`853`
`854`		`- if self.parallel_config.enable_dbo:`
	`854`	`+ if self.parallel_config.use_ubatching:`
`855`	`855`	`a2a_backend = self.parallel_config.all2all_backend`
`856`	`856`	`assert a2a_backend in [`
`857`	`857`	`"deepep_low_latency",`
Original file line number	Diff line number	Diff line change
`@@ -105,7 +105,7 @@ def __init__(`
`105`	`105`	`self.comm_stream = torch.cuda.Stream(device=device)`
`106`	`106`	`# Ubatch threads plus the main thread`
`107`	`107`	`self.ready_barrier = threading.Barrier(`
`108`		`- self.vllm_config.parallel_config.num_of_microbatches + 1`
	`108`	`+ self.vllm_config.parallel_config.num_of_ubatches + 1`
`109`	`109`	`)`
`110`	`110`
`111`	`111`	`self.cudagraphs: dict[int, CUDAGraphMetaData] = {}`