
Commit c41d02f

remove attn output view kernel
Signed-off-by: Boyuan Feng <boyuan@meta.com>
1 parent da44556

File tree

1 file changed: +6 -2 lines changed

vllm/attention/layer.py

Lines changed: 6 additions & 2 deletions
@@ -345,13 +345,17 @@ def forward(
 
         if self.use_output:
             output_shape = output_shape if output_shape is not None else query.shape
-            output = torch.zeros(output_shape, dtype=output_dtype, device=query.device)
             hidden_size = output_shape[-1]
+
+            # Use torch.empty to avoid initializing tensor with zero.
+            output_numel = output_shape.numel()
+            output_shape = (output_numel//(self.num_heads * self.head_size), self.num_heads, self.head_size)
+            output = torch.empty(output_shape, dtype=output_dtype, device=query.device)
+
             # Reshape the query, key, and value tensors.
             # NOTE(woosuk): We do this outside the custom op to minimize the
             # CPU overheads from the non-CUDA-graph regions.
             query = query.view(-1, self.num_heads, self.head_size)
-            output = output.view(-1, self.num_heads, self.head_size)
             if key is not None:
                 key = key.view(-1, self.num_kv_heads, self.head_size)
             if value is not None:
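
In effect, the output buffer is now allocated directly in (num_tokens, num_heads, head_size) form with torch.empty, so the separate output.view(...) call is dropped and no zero-fill kernel runs; this presumes the attention op overwrites every element of the buffer. A minimal standalone sketch of the before/after allocation pattern, using made-up sizes rather than the layer's real num_heads/head_size configuration:

import torch

# Hypothetical sizes for illustration only; in the layer these come from
# self.num_heads / self.head_size and the incoming output_shape.
num_tokens, num_heads, head_size = 4, 8, 64
output_shape = torch.Size([num_tokens, num_heads * head_size])

# Before: zero-filled 2D buffer plus a separate .view into 3D.
out_old = torch.zeros(output_shape, dtype=torch.float16)
out_old = out_old.view(-1, num_heads, head_size)

# After: allocate directly in (num_tokens, num_heads, head_size) with
# torch.empty, skipping both the zero-fill kernel and the extra view.
output_numel = output_shape.numel()
out_new = torch.empty((output_numel // (num_heads * head_size),
                       num_heads, head_size), dtype=torch.float16)

assert out_old.shape == out_new.shape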
