@@ -111,6 +111,13 @@ def maybe_get_vit_flash_attn_backend(
     return attn_backend, flash_attn_varlen_func
 
 
+def allocate_tensor(shape: torch.Size, device: torch.device, dtype: torch.dtype):
+    if get_current_vllm_config().model_config.init_attn_out:
+        return torch.zeros(shape, device=device, dtype=dtype)
+    else:
+        return torch.empty(shape, device=device, dtype=dtype)
+
+
 class Attention(nn.Module, AttentionLayerBase):
     """Attention layer.
 
@@ -349,8 +356,8 @@ def forward(
 
             # Use torch.empty to avoid initializing tensor with zero.
             output_numel = output_shape.numel()
-            output_shape = (output_numel // (self.num_heads * self.head_size), self.num_heads, self.head_size)
-            output = torch.empty(output_shape, dtype=output_dtype, device=query.device)
+            output_shape = torch.Size((output_numel // (self.num_heads * self.head_size), self.num_heads, self.head_size))
+            output = allocate_tensor(output_shape, device=query.device, dtype=output_dtype)
 
             # Reshape the query, key, and value tensors.
             # NOTE(woosuk): We do this outside the custom op to minimize the
@@ -708,7 +715,7 @@ def forward(
                 self.calc_kv_scales(q, kv_c_normed, k_pe)
 
             if self.attn_backend.accept_output_buffer:
-                output = torch.zeros(output_shape, dtype=q.dtype, device=q.device)
+                output = allocate_tensor(output_shape, dtype=q.dtype, device=q.device)
                 self.impl.forward(
                     self,
                     q,
@@ -725,7 +732,7 @@ def forward(
                 )
         else:
             if self.attn_backend.accept_output_buffer:
-                output = torch.zeros(output_shape, dtype=q.dtype, device=q.device)
+                output = allocate_tensor(output_shape, dtype=q.dtype, device=q.device)
                 torch.ops.vllm.unified_mla_attention_with_output(
                     q,
                     kv_c_normed,
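For reference, a minimal, self-contained sketch of the allocation switch this diff introduces: the real helper reads the flag from `get_current_vllm_config().model_config.init_attn_out`, while the version below takes the flag as an explicit parameter so it runs outside vLLM. The parameter and the example shapes are illustrative stand-ins, not part of the patch.

```python
import torch


def allocate_tensor(
    shape: torch.Size,
    device: torch.device,
    dtype: torch.dtype,
    init_attn_out: bool = False,  # stand-in for model_config.init_attn_out
) -> torch.Tensor:
    # Zero-fill the attention output buffer when requested; otherwise skip
    # initialization and save the memset, as the diff's helper does.
    if init_attn_out:
        return torch.zeros(shape, device=device, dtype=dtype)
    return torch.empty(shape, device=device, dtype=dtype)


# Example: an output buffer shaped (num_tokens, num_heads, head_size),
# mirroring the shape computed in Attention.forward above.
num_heads, head_size = 8, 64
out_shape = torch.Size((16, num_heads, head_size))
buf = allocate_tensor(out_shape, device=torch.device("cpu"), dtype=torch.float16)
print(buf.shape)  # torch.Size([16, 8, 64])
```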