 from vllm.config import CUDAGraphMode, ParallelConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
+from vllm.v1.worker.ubatch_utils import UBatchSlices, is_second_ubatch_empty
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
@@ -97,6 +98,53 @@ def num_tokens_across_dp(num_tokens: int, dp_size: int,
         dist.all_reduce(num_tokens_tensor, group=group)
         return num_tokens_tensor.cpu()
 
+    @staticmethod
+    def should_ubatch_across_dp(
+            should_ubatch: bool, orig_num_tokens_per_ubatch: int,
+            padded_num_tokens_per_ubatch: int, dp_size: int,
+            dp_rank: int) -> tuple[bool, Optional[torch.Tensor]]:
+        """
+        1. Decides whether each DP rank is going to microbatch. Either all
+        ranks run with microbatching or none of them do. If this function
+        decides not to run with microbatching, it "aborts": no padding
+        information is returned to the caller and the result is (False, None).
+
+        2. Determines the total number of tokens that each rank will run.
+        All ranks will be padded out so that they run with the same number
+        of tokens.
+
+        Returns: tuple[
+            should_ubatch: Are all DP ranks going to microbatch
+            num_tokens_after_padding: A tensor containing the total number of
+            tokens per-microbatch for each DP rank including padding. Will be
+            None if should_ubatch is False
+        ]
+        """
+
+        device = current_platform.device_type
+        tensor = torch.zeros(3, dp_size, device=device, dtype=torch.int32)
+        tensor[0][dp_rank] = orig_num_tokens_per_ubatch
+        tensor[1][dp_rank] = padded_num_tokens_per_ubatch
+        tensor[2][dp_rank] = 1 if should_ubatch else 0
+
+        from vllm.distributed.parallel_state import get_dp_group
+        dist.all_reduce(tensor, group=get_dp_group().device_group)
+
+        result: bool = bool(torch.all(tensor[2] == 1).item())
+        if not result:
+            return result, None
+
+        orig_num_tokens_tensor = tensor[0, :]
+        padded_num_tokens_tensor = tensor[1, :]
+
+        orig_min_num_tokens = int(orig_num_tokens_tensor.min().item())
+        padded_max_num_tokens = int(padded_num_tokens_tensor.max().item())
+        if is_second_ubatch_empty(orig_min_num_tokens, padded_max_num_tokens):
+            logger.debug("Aborting ubatching %s %s", orig_min_num_tokens,
+                         padded_max_num_tokens)
+            return False, None
+        return result, padded_num_tokens_tensor.cpu()
+
     @staticmethod
     def make(
             parallel_config: ParallelConfig,
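For reviewers' reference, here is a minimal CPU-only sketch of the consensus logic in `should_ubatch_across_dp`: each rank fills one column of a `(3, dp_size)` tensor, and summing the per-rank tensors stands in for `dist.all_reduce`, so every rank sees every other rank's original token count, padded token count, and microbatch vote. The per-rank numbers are invented, and the `is_second_ubatch_empty` abort is only noted in a comment.

```python
import torch

dp_size = 4
# (orig_tokens, padded_tokens, should_ubatch) reported by each hypothetical rank.
per_rank = [(512, 512, 1), (384, 512, 1), (768, 768, 1), (96, 768, 1)]

reduced = torch.zeros(3, dp_size, dtype=torch.int32)
for rank, (orig, padded, vote) in enumerate(per_rank):
    local = torch.zeros(3, dp_size, dtype=torch.int32)
    local[0][rank] = orig
    local[1][rank] = padded
    local[2][rank] = vote
    reduced += local  # stands in for dist.all_reduce(local, group=...)

# All ranks must vote yes, otherwise the real method returns (False, None).
should_ubatch = bool(torch.all(reduced[2] == 1).item())
orig_min = int(reduced[0].min().item())
padded_max = int(reduced[1].max().item())
# The real method additionally aborts when
# is_second_ubatch_empty(orig_min, padded_max) is true.
print(should_ubatch, orig_min, padded_max)  # True 96 768
```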
@@ -119,14 +167,15 @@ def make(
 
         # If num_tokens_across_dp is None, it will be computed by all_reduce
         # Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize
-        assert (num_tokens_across_dp is None
-                or num_tokens_across_dp[dp_rank] == batchsize)
+        assert (num_tokens_across_dp is None or num_tokens_across_dp[dp_rank]
+                == batchsize), f"{num_tokens_across_dp[dp_rank]} {batchsize}"
         if num_tokens_across_dp is None:
             num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
                 batchsize, dp_size, dp_rank)
         max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp)
         cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_across_dp, dim=0)
-        return DPMetadata(max_tokens_across_dp_cpu, cu_tokens_across_dp_cpu)
+        return DPMetadata(max_tokens_across_dp_cpu, cu_tokens_across_dp_cpu,
+                          num_tokens_across_dp)
 
     @contextmanager
     def chunked_sizes(self, max_chunk_size_per_rank: int, chunk_idx: int):
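For context, a tiny sketch (with invented per-rank counts) of what `DPMetadata.make` derives once `num_tokens_across_dp` is known: the max determines how far every rank pads, and the cumulative sum gives each rank its token offset.

```python
import torch

# Hypothetical per-rank token counts for dp_size == 4.
num_tokens_across_dp = torch.tensor([512, 384, 768, 96])

max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp)
cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_across_dp, dim=0)

print(max_tokens_across_dp_cpu)  # tensor(768)
print(cu_tokens_across_dp_cpu)   # tensor([ 512,  896, 1664, 1760])
```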
@@ -179,9 +228,12 @@ class ForwardContext:
     Type AttentionMetadata for v0,
     Type Dict[str, AttentionMetadata] for v1, map from layer_name of each
     attention layer to its attention metadata
-    set dynamically for each forward pass
+    Type List[Dict[str, AttentionMetadata]] for DBO. List of size two, one
+    for each microbatch.
+    Set dynamically for each forward pass
     """
-    attn_metadata: Union["AttentionMetadata", dict[str, "AttentionMetadata"]]
+    attn_metadata: Union["AttentionMetadata", dict[str, "AttentionMetadata"],
+                         list[dict[str, "AttentionMetadata"]]]
     # TODO: remove after making all virtual_engines share the same kv cache
     virtual_engine: int  # set dynamically for each forward pass
     # set dynamically for each forward pass
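Purely illustrative: the three shapes `attn_metadata` can now take. `_FakeAttnMeta` and the layer key are stand-ins for a backend-specific `AttentionMetadata` object and a real layer name.

```python
from dataclasses import dataclass


@dataclass
class _FakeAttnMeta:
    num_actual_tokens: int


v0_style = _FakeAttnMeta(8)                                     # single object
v1_style = {"model.layers.0.self_attn.attn": _FakeAttnMeta(8)}  # per-layer dict
dbo_style = [                                                   # one dict per microbatch
    {"model.layers.0.self_attn.attn": _FakeAttnMeta(4)},
    {"model.layers.0.self_attn.attn": _FakeAttnMeta(4)},
]
```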
@@ -191,6 +243,8 @@ class ForwardContext:
     cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE
     batch_descriptor: Optional[BatchDescriptor] = None
 
+    ubatch_slices: Optional[UBatchSlices] = None
+
     def __post_init__(self):
         assert self.cudagraph_runtime_mode in [
             CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL], \
@@ -208,6 +262,39 @@ def get_forward_context() -> ForwardContext:
     return _forward_context
 
 
+def create_forward_context(
+        attn_metadata: Any,
+        vllm_config: VllmConfig,
+        virtual_engine: int = 0,
+        dp_metadata: Optional[DPMetadata] = None,
+        cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
+        batch_descriptor: Optional[BatchDescriptor] = None,
+        ubatch_slices: Optional[UBatchSlices] = None):
+    return ForwardContext(no_compile_layers=vllm_config.compilation_config.
+                          static_forward_context,
+                          virtual_engine=virtual_engine,
+                          attn_metadata=attn_metadata,
+                          dp_metadata=dp_metadata,
+                          cudagraph_runtime_mode=cudagraph_runtime_mode,
+                          batch_descriptor=batch_descriptor,
+                          ubatch_slices=ubatch_slices)
+
+
+@contextmanager
+def override_forward_context(forward_context: Optional[ForwardContext]):
+    """A context manager that overrides the current forward context.
+    This is used to override the forward context for a specific
+    forward pass.
+    """
+    global _forward_context
+    prev_context = _forward_context
+    _forward_context = forward_context
+    try:
+        yield
+    finally:
+        _forward_context = prev_context
+
+
 @contextmanager
 def set_forward_context(
         attn_metadata: Any,
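A hedged usage sketch of the two new helpers (assuming they are exported from `vllm.forward_context` alongside `set_forward_context`; `run_one_microbatch` and its arguments are placeholders, not part of this PR): build a per-microbatch `ForwardContext` with `create_forward_context` and install it only for that microbatch's forward pass with `override_forward_context`. The previous context is restored on exit even if the forward raises.

```python
from typing import Any, Optional

from vllm.config import VllmConfig
from vllm.forward_context import (DPMetadata, create_forward_context,
                                  override_forward_context)


def run_one_microbatch(model, ubatch_inputs: dict[str, Any],
                       ubatch_attn_metadata: dict[str, Any],
                       vllm_config: VllmConfig,
                       dp_metadata: Optional[DPMetadata],
                       ubatch_slices):
    """Hypothetical helper illustrating the intended call pattern."""
    ctx = create_forward_context(ubatch_attn_metadata,
                                 vllm_config,
                                 dp_metadata=dp_metadata,
                                 ubatch_slices=ubatch_slices)
    with override_forward_context(ctx):
        # Layers that call get_forward_context() now see this microbatch's
        # metadata; the prior context comes back when the block exits.
        return model(**ubatch_inputs)
```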
@@ -216,7 +303,8 @@ def set_forward_context(
         num_tokens: Optional[int] = None,
         num_tokens_across_dp: Optional[torch.Tensor] = None,
         cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
-        batch_descriptor: Optional[BatchDescriptor] = None):
+        batch_descriptor: Optional[BatchDescriptor] = None,
+        ubatch_slices: Optional[UBatchSlices] = None):
     """A context manager that stores the current forward context,
     can be attention metadata, etc.
     Here we can inject common logic for every model forward pass.
@@ -225,27 +313,22 @@ def set_forward_context(
     need_to_track_batchsize = track_batchsize and attn_metadata is not None
     if need_to_track_batchsize:
         forward_start_time = time.perf_counter()
+
     dp_metadata: Optional[DPMetadata] = None
     if vllm_config.parallel_config.data_parallel_size > 1 and (
             attn_metadata is not None or num_tokens is not None):
         dp_metadata = DPMetadata.make(vllm_config.parallel_config,
                                       attn_metadata, num_tokens or 0,
                                       num_tokens_across_dp)
 
-    global _forward_context
-    prev_context = _forward_context
-    _forward_context = ForwardContext(
-        no_compile_layers=vllm_config.compilation_config.
-        static_forward_context,
-        virtual_engine=virtual_engine,
-        attn_metadata=attn_metadata,
-        dp_metadata=dp_metadata,
-        cudagraph_runtime_mode=cudagraph_runtime_mode,
-        batch_descriptor=batch_descriptor,
-    )
+    forward_context = create_forward_context(attn_metadata, vllm_config,
+                                             virtual_engine, dp_metadata,
+                                             cudagraph_runtime_mode,
+                                             batch_descriptor, ubatch_slices)
 
     try:
-        yield
+        with override_forward_context(forward_context):
+            yield
     finally:
         global last_logging_time, batchsize_logging_interval
         if need_to_track_batchsize:
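For callers, the only visible change to `set_forward_context` is the extra optional `ubatch_slices` argument; existing call sites keep working unchanged. A rough sketch, with placeholder names:

```python
from typing import Any

import torch

from vllm.config import VllmConfig
from vllm.forward_context import set_forward_context


def forward_pass(model, input_ids: torch.Tensor, positions: torch.Tensor,
                 attn_metadata: Any, vllm_config: VllmConfig,
                 ubatch_slices=None):
    """Hypothetical caller, not part of this PR."""
    with set_forward_context(attn_metadata,
                             vllm_config,
                             num_tokens=input_ids.shape[0],
                             ubatch_slices=ubatch_slices):
        return model(input_ids, positions)
```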
@@ -282,5 +365,3 @@ def set_forward_context(
                     logger.info(("Batchsize forward time stats "
                                  "(batchsize, count, median_time(ms)): %s"),
                                 forward_stats)
-
-        _forward_context = prev_context