Commit 33aaa15

[Kernel] Apply torch.Tag.needs_fixed_stride_order only for torch==2.6.0

Summary: In torch 2.6.0, torch accidentally changed the default behavior for
custom operators to "requires_contiguous". As a workaround, vLLM added
needs_fixed_stride_order to a large number of custom operators. vLLM is
currently on torch 2.7.0, which has reverted the default for custom operators
back to needs_fixed_stride_order, so this PR cleans up the kernel logic by
flipping the default back. The other reason to flip the default back is that
needs_fixed_stride_order is actually buggy, and torch 2.8.0 has better
behavior for custom operators with no layout tags set. Also, Kaichao tells me
that some backends may not have moved to PyTorch 2.7.0 yet
(vllm-project#8932), so I didn't delete the code in this PR.

Test Plan:
- Existing tests
- Ran `pytest tests/compile/test_full_graph.py` (this was the test that
  originally caused us to add the needs_fixed_stride_order tag; see
  vllm-project#12721 for context)

Signed-off-by: rzou <zou3519@gmail.com>
1 parent 2e3e3c8 commit 33aaa15
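
For context, a minimal sketch (not from this commit) of how such a layout tag
is attached when defining a custom operator through torch.library; the library
name and operator schema below are hypothetical, while the tag and the
registration API come from PyTorch itself:

    import torch

    # Hypothetical illustration, not code from this commit. With no layout
    # tag, torch.compile applies its default input-layout policy; in torch
    # 2.6.0 that default was accidentally "requires_contiguous", which
    # inserts .contiguous() calls and breaks ops expecting e.g. column-major
    # weights. Attaching needs_fixed_stride_order makes torch.compile pass
    # inputs with their eager-mode strides instead.
    lib = torch.library.Library("my_ext", "FRAGMENT")
    lib.define(
        "column_major_mm(Tensor a, Tensor b) -> Tensor",
        tags=(torch.Tag.needs_fixed_stride_order, ),
    )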

File tree: 3 files changed (+19, -9 lines)

- csrc/torch_bindings.cpp
- vllm/attention/ops/rocm_aiter_mla.py
- vllm/model_executor/layers/fused_moe/fused_moe.py
csrc/torch_bindings.cpp
Lines changed: 8 additions & 4 deletions

@@ -20,13 +20,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // vLLM custom ops
   //
 
-  // The default behavior in PyTorch 2.6 is "requires_contiguous", so we need
+  // The default behavior in PyTorch 2.6 was changed to "requires_contiguous",
+  // so we need
   // to override this for many GEMMs with the following tag. Otherwise,
   // torch.compile will force all input tensors to be contiguous(), which
   // will break many custom ops that require column-major weight matrices.
-  // TODO: remove this for PyTorch 2.8, when the default is planned to switch
-  // to match exact eager-mode strides.
-  at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
+  // This was a bug and PyTorch 2.7 has since fixed this.
+#if TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 6
+  #define stride_tag at::Tag::needs_fixed_stride_order
+#else
+  #define stride_tag
+#endif
 
   ops.def("weak_ref_tensor(Tensor input) -> Tensor");
   ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);

vllm/attention/ops/rocm_aiter_mla.py
Lines changed: 6 additions & 2 deletions

@@ -6,7 +6,7 @@
 import torch
 
 from vllm.platforms import current_platform
-from vllm.utils import direct_register_custom_op
+from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer
 
 
 def get_aiter_mla_metadata(max_batch_size: int, block_size: int,
@@ -93,8 +93,12 @@ def mla_decode_fwd_fake(
 
 
 if current_platform.is_rocm():
+    if is_torch_equal_or_newer("2.7.0"):
+        tags = ()
+    else:
+        tags = (torch.Tag.needs_fixed_stride_order, )
     direct_register_custom_op(op_name="rocm_aiter_mla_decode_fwd",
                               op_func=mla_decode_fwd_impl,
                               mutates_args=["o"],
                               fake_impl=mla_decode_fwd_fake,
-                              tags=[torch.Tag.needs_fixed_stride_order])
+                              tags=tags)
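
This diff leans on vllm.utils.is_torch_equal_or_newer, whose implementation is
not shown here. A minimal sketch of what such a helper could look like
(assumed, not vLLM's actual code):

    from importlib.metadata import version

    from packaging.version import Version


    def is_torch_equal_or_newer(target: str) -> bool:
        # Assumed sketch: compare the installed torch distribution's version
        # against the target. Version handles local suffixes like "+cu126",
        # which still compare >= the plain "2.7.0" release.
        return Version(version("torch")) >= Version(target)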

vllm/model_executor/layers/fused_moe/fused_moe.py
Lines changed: 5 additions & 3 deletions

@@ -22,7 +22,7 @@
     _resize_cache, moe_kernel_quantize_input)
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
-from vllm.utils import direct_register_custom_op
+from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer
 
 from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled
 
@@ -1053,7 +1053,8 @@ def inplace_fused_experts_fake(
     op_func=inplace_fused_experts,
     mutates_args=["hidden_states"],
     fake_impl=inplace_fused_experts_fake,
-    tags=(torch.Tag.needs_fixed_stride_order, ),
+    tags=(() if is_torch_equal_or_newer("2.7.0") else
+          (torch.Tag.needs_fixed_stride_order, )),
 )
 
 
@@ -1117,7 +1118,8 @@ def outplace_fused_experts_fake(
     op_func=outplace_fused_experts,
     mutates_args=[],
     fake_impl=outplace_fused_experts_fake,
-    tags=(torch.Tag.needs_fixed_stride_order, ),
+    tags=(() if is_torch_equal_or_newer("2.7.0") else
+          (torch.Tag.needs_fixed_stride_order, )),
 )
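
Taken together, the Python-side pattern is: compute the tag tuple once at
import time, then hand it to the registration helper. A self-contained sketch
of that pattern with a hypothetical op (the op, its implementations, and their
names are illustrative only; direct_register_custom_op and
is_torch_equal_or_newer are the vLLM helpers used in the diffs above):

    import torch

    from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer


    def scale_by_two(x: torch.Tensor) -> torch.Tensor:
        # Hypothetical real implementation.
        return x * 2


    def scale_by_two_fake(x: torch.Tensor) -> torch.Tensor:
        # Shape/dtype-only "fake" implementation for torch.compile tracing.
        return torch.empty_like(x)


    # Same gate as in the diffs: only torch 2.6.x builds still need the
    # needs_fixed_stride_order workaround.
    _tags = (() if is_torch_equal_or_newer("2.7.0") else
             (torch.Tag.needs_fixed_stride_order, ))

    direct_register_custom_op(op_name="scale_by_two",
                              op_func=scale_by_two,
                              mutates_args=[],
                              fake_impl=scale_by_two_fake,
                              tags=_tags)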
