@@ -74,7 +74,7 @@ def tma_kernel(desc):
     alloc._keep_alive()


-@pytest.mark.xfail(not is_hopper_or_newer(), reason="Requires Hopper")
+@pytest.mark.xfail(not is_hopper_or_newer(), reason="Requires Hopper", run=False)
 def test_tma():
     out = torch.ones((16, 16), dtype=torch.float16, device="cuda")
     layout = ttgl.NVMMASharedLayout(
@@ -113,7 +113,7 @@ def async_copy_mbarrier_kernel(out, inp, xnumel, XBLOCK: ttgl.constexpr, YBLOCK:
     ttgl.store(out + xindex * YBLOCK + yindex, val)


-@pytest.mark.xfail(not is_ampere_or_newer(), reason="Requires Ampere")
+@pytest.mark.xfail(not is_ampere_or_newer(), reason="Requires Ampere", run=False)
 def test_async_copy_mbarrier(device):
     tensor_opts = dict(dtype=torch.float, device=device)
     out = torch.empty((32, 32), **tensor_opts)
@@ -154,7 +154,7 @@ def warpgroup_mma_kernel(a, b, out, M: ttgl.constexpr, N: ttgl.constexpr, K: ttg
     ttgl.store(out + out_offs_m * N + out_offs_n, acc)


-@pytest.mark.xfail(not is_hopper(), reason="Requires Hopper")
+@pytest.mark.xfail(not is_hopper(), reason="Requires Hopper", run=False)
 @pytest.mark.parametrize("ASYNC", [True, False])
 def test_warpgroup_mma(ASYNC):
     torch.manual_seed(0)
@@ -169,7 +169,7 @@ def test_warpgroup_mma(ASYNC):
     torch.testing.assert_close(out, ref, atol=1e-3, rtol=1e-1)


-@pytest.mark.xfail(not is_hip_cdna4(), reason="Requires CDNA4")
+@pytest.mark.xfail(not is_hip_cdna4(), reason="Requires CDNA4", run=False)
 @pytest.mark.parametrize("use_buffer_load", [True, False])
 def test_amd_direct_load_to_shared(use_buffer_load):

@@ -205,7 +205,7 @@ def kernel(a_ptr, b_ptr, use_buffer_load: ttgl.constexpr):
     assert 'vmcnt(0)' in pgm.asm['amdgcn']


-@pytest.mark.xfail(not (is_hip_gfx11() or is_hip_gfx12()), reason="Requires RDNA3 or RDNA4")
+@pytest.mark.xfail(not (is_hip_gfx11() or is_hip_gfx12()), reason="Requires RDNA3 or RDNA4", run=False)
 @pytest.mark.parametrize("M, N, K", [(64, 64, 64)])
 @pytest.mark.parametrize("in_dtype", ['float16', 'bfloat16'])
 def test_amd_wmma(M, N, K, in_dtype):
@@ -331,7 +331,7 @@ def kernel(a_ptr, b_ptr, c_ptr, stride_am, stride_ak, #
     torch.testing.assert_close(ref, triton_output)


-@pytest.mark.xfail(not is_hip_cdna4(), reason="Requires CDNA4")
+@pytest.mark.xfail(not is_hip_cdna4(), reason="Requires CDNA4", run=False)
 @pytest.mark.parametrize("M, N, K, rhs_scale, mxfp_type, normal_type", [(32, 32, 128, rhs_scale, mxfp_type, normal_type)
                                                                         for rhs_scale in [True, False]
                                                                         for mxfp_type in ["e2m1"]
@@ -517,7 +517,7 @@ def fast_dividef_kernel(x_ptr, y_ptr, z_ptr, warp_size: ttgl.constexpr, num_warp


 @pytest.mark.xfail(reason="copy to tmem with scale layout is currently broken in Gluon.")
-@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell")
+@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell", run=False)
 def test_tmem_copy_2d():
     device = "cuda"

@@ -566,7 +566,7 @@ def kernel(in_ptr, out_ptr, smem_h: ttgl.constexpr, smem_w: ttgl.constexpr, num_
         assert torch.equal(x[m * 32:(m + 1) * 32], z_tri[32 * i:32 * (i + 1), col_offset:(col_offset + 4)])


-@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell")
+@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell", run=False)
 def test_tmem_subslice_block_m_64():

     @gluon.jit
@@ -646,7 +646,7 @@ def kernel(s_ptr, out_ptr):
     torch.testing.assert_close(out_ref, out_tri, atol=0, rtol=0)


-@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell")
+@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell", run=False)
 def test_block_m_64_mma():

     @gluon.jit
@@ -768,7 +768,7 @@ def kernel(in_ptr, out_ptr):
     torch.testing.assert_close(input, output, atol=0, rtol=0)


-@pytest.mark.xfail(not is_hopper_or_newer(), reason="Requires Hopper")
+@pytest.mark.xfail(not is_hopper_or_newer(), reason="Requires Hopper", run=False)
 def test_tma_slice():
     XBLOCK = YBLOCK = ttgl.constexpr(128)

@@ -805,7 +805,7 @@ def kernel(in_desc, out_desc):
 @pytest.mark.parametrize("swizzle", [32, 64, 128])
 @pytest.mark.parametrize("num_warps", [4, 8])
 @pytest.mark.parametrize("M, N, BLOCK_N", [(128, 128, 128), (256, 128, 64), (128, 128, 16)])
-@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell")
+@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell", run=False)
 def test_tmem_copy_no_scales(M, N, BLOCK_N, num_warps, swizzle):

     @gluon.jit
@@ -879,7 +879,7 @@ def kernel(N, out):
     assert compiled_kernel.asm["llir"].count("define") == 1


-@pytest.mark.xfail(not is_hip_cdna3() and not is_hip_cdna4(), reason="Requires CDNA3 or CDNA4")
+@pytest.mark.xfail(not is_hip_cdna3() and not is_hip_cdna4(), reason="Requires CDNA3 or CDNA4", run=False)
 def test_inline_with_amdgpu_dialect():

     @gluon.jit
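For reference, a minimal sketch of the marker pattern this diff applies throughout: passing run=False to pytest.mark.xfail makes pytest report the test as an expected failure without ever calling it, so a kernel that needs newer hardware is never launched on a GPU that lacks it. The is_hopper_or_newer helper below is a hypothetical stand-in for the capability checks used in the test suite, not the suite's actual implementation.

import pytest
import torch

def is_hopper_or_newer():
    # Hypothetical stand-in for the test suite's capability helper:
    # Hopper corresponds to CUDA compute capability 9.x.
    return torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 9

# With run=False, pytest records the test as xfail without executing it,
# so the body never runs on hardware that cannot support it.
@pytest.mark.xfail(not is_hopper_or_newer(), reason="Requires Hopper", run=False)
def test_requires_hopper():
    assert True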