
Commit c2d6a24

[TEST] Do not run pytest.mark.xfail
Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
1 parent ecf2442 commit c2d6a24
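
A plain `@pytest.mark.xfail(condition, reason=...)` still executes the test body and only reinterprets its outcome (a failure becomes XFAIL, a pass becomes XPASS). Adding `run=False` tells pytest to report the test as xfailed without executing it at all when the condition holds, which matters when the body could crash or hang on unsupported hardware rather than fail cleanly. A minimal sketch of the pattern applied throughout this commit; the local `is_cuda()` helper and the test name are hypothetical stand-ins, not the repository's own code:

import pytest
import torch


def is_cuda():
    # Hypothetical stand-in for the repository's backend helper.
    return torch.cuda.is_available()


# With run=False, pytest collects the test but never runs the body when the
# condition is true; it is reported as XFAIL with the given reason. When the
# condition is false (supported hardware), the test runs normally.
@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 9,
                   reason="Requires hopper or newer", run=False)
def test_requires_hopper():
    assert torch.cuda.get_device_capability()[0] >= 9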

File tree

7 files changed (+31 -31 lines changed)

python/test/gluon/test_consan.py

Lines changed: 9 additions & 9 deletions
@@ -85,7 +85,7 @@ def async_tma_kernel(input_desc, XBLOCK: ttgl.constexpr, FAILURE: ttgl.constexpr
     tma.store_wait(0)


-@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires hopper or newer")
+@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires hopper or newer", run=False)
 @pytest.mark.parametrize("FAILURE", [True, False])
 def test_async_tma_kernel(FAILURE, device, run_wrapper):
     if run_wrapper:

@@ -141,7 +141,7 @@ def tma_interleave_kernel(input_desc, XBLOCK: ttgl.constexpr, FAILURE: ttgl.cons
     tma.store_wait(0)


-@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires hopper or newer")
+@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires hopper or newer", run=False)
 @pytest.mark.parametrize("FAILURE", [True, False])
 def test_tma_interleave_kernel(FAILURE, device, run_wrapper):
     if run_wrapper:

@@ -190,7 +190,7 @@ def async_copy_kernel(input, XBLOCK: ttgl.constexpr, FAILURE: ttgl.constexpr):
     ampere.async_copy.wait_group(0)


-@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires ampere or newer")
+@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 9, reason="Requires ampere or newer", run=False)
 @pytest.mark.parametrize("FAILURE", [True, False])
 def test_async_copy(FAILURE, device, run_wrapper):
     if run_wrapper:

@@ -252,7 +252,7 @@ def tcgen5_mma_kernel(input_desc, XBLOCK: ttgl.constexpr, FAILURE: ttgl.constexp
     mbarrier.invalidate(bar.index(1))


-@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 10, reason="Requires blackwell or newer")
+@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 10, reason="Requires blackwell or newer", run=False)
 @pytest.mark.parametrize("FAILURE", [True, False])
 @pytest.mark.parametrize("MEM_ACCESS_KIND", ["tma_cp", "local_store", "tmem_load", "tmem_store"])
 def test_tcgen5_mma(FAILURE, MEM_ACCESS_KIND, device, run_wrapper):

@@ -305,7 +305,7 @@ def warpgroup_mma_kernel(input, XBLOCK: ttgl.constexpr, FAILURE: ttgl.constexpr)
     smemA.store(ttgl.full([XBLOCK, XBLOCK], 42, ttgl.float16, blocked_layout))


-@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] != 9, reason="Requires hopper")
+@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] != 9, reason="Requires hopper", run=False)
 @pytest.mark.parametrize("FAILURE", [True, False])
 def test_warpgroup_mma(FAILURE, device, run_wrapper):
     if run_wrapper:

@@ -353,7 +353,7 @@ def warpgroup_mma_kernel2(input, XBLOCK: ttgl.constexpr, FAILURE: ttgl.constexpr
     smemA.store(ttgl.full([XBLOCK, XBLOCK], 42, ttgl.float16, blocked_layout))


-@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] != 9, reason="Requires hopper")
+@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] != 9, reason="Requires hopper", run=False)
 @pytest.mark.parametrize("FAILURE", [True, False])
 def test_warpgroup_mma2(FAILURE, device, run_wrapper):
     if run_wrapper:

@@ -406,7 +406,7 @@ def tcgen5_mma_multibar_kernel(input_desc, XBLOCK: ttgl.constexpr, BUF_IDX: ttgl
     mbarrier.invalidate(bar.index(i))


-@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 10, reason="Requires blackwell or newer")
+@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 10, reason="Requires blackwell or newer", run=False)
 @pytest.mark.parametrize("BUF_IDX", [0, 1])
 @pytest.mark.parametrize("BAR_IDX", [0, 1, 2, 3])
 def test_tcgen5_mma_multibar(BUF_IDX, BAR_IDX, device, run_wrapper):

@@ -529,7 +529,7 @@ def multibuffered_loop_tma_kernel(input_desc, XBLOCK: ttgl.constexpr, FAILURE: t
     mbarrier.invalidate(barMMA.index(i))


-@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 10, reason="Requires blackwell or newer")
+@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 10, reason="Requires blackwell or newer", run=False)
 @pytest.mark.parametrize("FAILURE", [True, False])
 def test_multibuffered_loop(FAILURE, device, run_wrapper):
     if run_wrapper:

@@ -611,7 +611,7 @@ def multibuffered_loop_wgmma_kernel(input_desc, XBLOCK: ttgl.constexpr, FAILURE:
     mbarrier.invalidate(barLoadB.index(i))


-@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] != 9, reason="Requires hopper")
+@pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] != 9, reason="Requires hopper", run=False)
 @pytest.mark.parametrize("FAILURE", [True, False])
 def test_multibuffered_wgmma_loop(FAILURE, device, run_wrapper):
     if run_wrapper:

python/test/gluon/test_core.py

Lines changed: 12 additions & 12 deletions
@@ -74,7 +74,7 @@ def tma_kernel(desc):
     alloc._keep_alive()


-@pytest.mark.xfail(not is_hopper_or_newer(), reason="Requires Hopper")
+@pytest.mark.xfail(not is_hopper_or_newer(), reason="Requires Hopper", run=False)
 def test_tma():
     out = torch.ones((16, 16), dtype=torch.float16, device="cuda")
     layout = ttgl.NVMMASharedLayout(

@@ -113,7 +113,7 @@ def async_copy_mbarrier_kernel(out, inp, xnumel, XBLOCK: ttgl.constexpr, YBLOCK:
     ttgl.store(out + xindex * YBLOCK + yindex, val)


-@pytest.mark.xfail(not is_ampere_or_newer(), reason="Requires Ampere")
+@pytest.mark.xfail(not is_ampere_or_newer(), reason="Requires Ampere", run=False)
 def test_async_copy_mbarrier(device):
     tensor_opts = dict(dtype=torch.float, device=device)
     out = torch.empty((32, 32), **tensor_opts)

@@ -154,7 +154,7 @@ def warpgroup_mma_kernel(a, b, out, M: ttgl.constexpr, N: ttgl.constexpr, K: ttg
     ttgl.store(out + out_offs_m * N + out_offs_n, acc)


-@pytest.mark.xfail(not is_hopper(), reason="Requires Hopper")
+@pytest.mark.xfail(not is_hopper(), reason="Requires Hopper", run=False)
 @pytest.mark.parametrize("ASYNC", [True, False])
 def test_warpgroup_mma(ASYNC):
     torch.manual_seed(0)

@@ -169,7 +169,7 @@ def test_warpgroup_mma(ASYNC):
     torch.testing.assert_close(out, ref, atol=1e-3, rtol=1e-1)


-@pytest.mark.xfail(not is_hip_cdna4(), reason="Requires CDNA4")
+@pytest.mark.xfail(not is_hip_cdna4(), reason="Requires CDNA4", run=False)
 @pytest.mark.parametrize("use_buffer_load", [True, False])
 def test_amd_direct_load_to_shared(use_buffer_load):


@@ -205,7 +205,7 @@ def kernel(a_ptr, b_ptr, use_buffer_load: ttgl.constexpr):
     assert 'vmcnt(0)' in pgm.asm['amdgcn']


-@pytest.mark.xfail(not (is_hip_gfx11() or is_hip_gfx12()), reason="Requires RDNA3 or RDNA4")
+@pytest.mark.xfail(not (is_hip_gfx11() or is_hip_gfx12()), reason="Requires RDNA3 or RDNA4", run=False)
 @pytest.mark.parametrize("M, N, K", [(64, 64, 64)])
 @pytest.mark.parametrize("in_dtype", ['float16', 'bfloat16'])
 def test_amd_wmma(M, N, K, in_dtype):

@@ -331,7 +331,7 @@ def kernel(a_ptr, b_ptr, c_ptr, stride_am, stride_ak, #
     torch.testing.assert_close(ref, triton_output)


-@pytest.mark.xfail(not is_hip_cdna4(), reason="Requires CDNA4")
+@pytest.mark.xfail(not is_hip_cdna4(), reason="Requires CDNA4", run=False)
 @pytest.mark.parametrize("M, N, K, rhs_scale, mxfp_type, normal_type", [(32, 32, 128, rhs_scale, mxfp_type, normal_type)
                                                                         for rhs_scale in [True, False]
                                                                         for mxfp_type in ["e2m1"]

@@ -517,7 +517,7 @@ def fast_dividef_kernel(x_ptr, y_ptr, z_ptr, warp_size: ttgl.constexpr, num_warp


 @pytest.mark.xfail(reason="copy to tmem with scale layout is currently broken in Gluon.")
-@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell")
+@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell", run=False)
 def test_tmem_copy_2d():
     device = "cuda"


@@ -566,7 +566,7 @@ def kernel(in_ptr, out_ptr, smem_h: ttgl.constexpr, smem_w: ttgl.constexpr, num_
     assert torch.equal(x[m * 32:(m + 1) * 32], z_tri[32 * i:32 * (i + 1), col_offset:(col_offset + 4)])


-@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell")
+@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell", run=False)
 def test_tmem_subslice_block_m_64():

     @gluon.jit

@@ -646,7 +646,7 @@ def kernel(s_ptr, out_ptr):
     torch.testing.assert_close(out_ref, out_tri, atol=0, rtol=0)


-@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell")
+@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell", run=False)
 def test_block_m_64_mma():

     @gluon.jit

@@ -768,7 +768,7 @@ def kernel(in_ptr, out_ptr):
     torch.testing.assert_close(input, output, atol=0, rtol=0)


-@pytest.mark.xfail(not is_hopper_or_newer(), reason="Requires Hopper")
+@pytest.mark.xfail(not is_hopper_or_newer(), reason="Requires Hopper", run=False)
 def test_tma_slice():
     XBLOCK = YBLOCK = ttgl.constexpr(128)


@@ -805,7 +805,7 @@ def kernel(in_desc, out_desc):
 @pytest.mark.parametrize("swizzle", [32, 64, 128])
 @pytest.mark.parametrize("num_warps", [4, 8])
 @pytest.mark.parametrize("M, N, BLOCK_N", [(128, 128, 128), (256, 128, 64), (128, 128, 16)])
-@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell")
+@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell", run=False)
 def test_tmem_copy_no_scales(M, N, BLOCK_N, num_warps, swizzle):

     @gluon.jit

@@ -879,7 +879,7 @@ def kernel(N, out):
     assert compiled_kernel.asm["llir"].count("define") == 1


-@pytest.mark.xfail(not is_hip_cdna3() and not is_hip_cdna4(), reason="Requires CDNA3 or CDNA4")
+@pytest.mark.xfail(not is_hip_cdna3() and not is_hip_cdna4(), reason="Requires CDNA3 or CDNA4", run=False)
 def test_inline_with_amdgpu_dialect():

     @gluon.jit

python/test/unit/intel/test_block_load.py

Lines changed: 2 additions & 2 deletions
@@ -14,7 +14,7 @@
 @pytest.mark.parametrize("transpose", [True, False])
 @pytest.mark.skipif(not is_xpu(), reason="Block load tests are specific to the XPU backend")
 @pytest.mark.xfail(not torch.xpu.get_device_capability()['has_subgroup_2d_block_io'],
-                   reason="Block loads not supported on this architecture")
+                   reason="Block loads not supported on this architecture", run=False)
 def test_block_load_dpas_layout(M, N, dtype_str, transpose, device, tmp_path: pathlib.Path):
     # modify the layouts to ensure the correct OCL/SPIRV intrinsic is called for each datatype
     if dtype_str == "int8":

@@ -92,7 +92,7 @@ def test_block_load_dpas_layout(M, N, dtype_str, transpose, device, tmp_path: pa
 @pytest.mark.xfail(
     not (torch.xpu.get_device_capability()['has_subgroup_2d_block_io']
          and torch.xpu.get_device_capability()['has_subgroup_matrix_multiply_accumulate']),
-    reason="Block loads and/or DPAS not supported on this architecture")
+    reason="Block loads and/or DPAS not supported on this architecture", run=False)
 def test_block_load_dot_product(BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, GROUP_SIZE_M, TRANSPOSE_A, TRANSPOSE_B,
                                 device):
     if GROUP_SIZE_M == 1 and (BLOCK_SIZE_M > 64 or BLOCK_SIZE_N > 64):

python/test/unit/runtime/test_autotuner.py

Lines changed: 3 additions & 3 deletions
@@ -174,7 +174,7 @@ def _kernel(dst, src, N, BLOCK_SIZE: tl.constexpr):


 @pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 9,
-                   reason="Requires compute capability >= 9 for NV")
+                   reason="Requires compute capability >= 9 for NV", run=False)
 def test_override_ttir(device):
     N = 1024
     src = torch.randn(N, device=device)

@@ -223,7 +223,7 @@ def _kernel(dst, src, N, BLOCK_SIZE: tl.constexpr):


 @pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] < 9,
-                   reason="Requires compute capability >= 9 for NV")
+                   reason="Requires compute capability >= 9 for NV", run=False)
 def test_override_ttgir(device):
     N = 1024
     src = torch.randn(N, device=device)

@@ -273,7 +273,7 @@ def _kernel(dst, src, N, BLOCK_SIZE: tl.constexpr):


 @pytest.mark.xfail(not is_cuda() or torch.cuda.get_device_capability()[0] != 9,
-                   reason="PTX file in this unit test is only for SM90")
+                   reason="PTX file in this unit test is only for SM90", run=False)
 def test_override_ptx(device):
     N = 1024
     src = torch.randn(N, device=device)

python/triton_kernels/tests/test_tensor_details/test_layout_blackwell.py

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@
         (3, 2, 36),
     ],
 )
-@pytest.mark.xfail(condition=not is_cuda(), reason="Only supported on CUDA")
+@pytest.mark.xfail(condition=not is_cuda(), reason="Only supported on CUDA", run=False)
 def test_mxfp4_scale_roundtrip(shape):
     x = torch.randint(0, 256, shape, dtype=torch.uint8, device="cuda")
     layout = BlackwellMXScaleLayout(x.shape)

python/triton_kernels/tests/test_tensor_details/test_layout_hopper.py

Lines changed: 3 additions & 3 deletions
@@ -19,7 +19,7 @@
 @pytest.mark.parametrize("trans", [False, True])
 @pytest.mark.parametrize("mx_axis", [0, 1])
 @pytest.mark.parametrize("mma_version", [2, 3])
-@pytest.mark.xfail(condition=not is_cuda(), reason="Only supported on CUDA")
+@pytest.mark.xfail(condition=not is_cuda(), reason="Only supported on CUDA", run=False)
 def test_mxfp4_value_roundtrip(shape, trans, mx_axis, mma_version):
     x = torch.randint(0, 256, shape, dtype=torch.uint8, device="cuda")
     if trans:

@@ -34,7 +34,7 @@ def test_mxfp4_value_roundtrip(shape, trans, mx_axis, mma_version):
 @pytest.mark.parametrize("mx_axis", [0, 1])
 @pytest.mark.parametrize("num_warps", [4, 8])
 @pytest.mark.parametrize("shape", [(256, 64), (256, 128), (256, 256)])
-@pytest.mark.xfail(condition=not is_cuda(), reason="Only supported on CUDA")
+@pytest.mark.xfail(condition=not is_cuda(), reason="Only supported on CUDA", run=False)
 def test_mxfp4_scale_roundtrip(shape, mx_axis, num_warps):
     x = torch.randint(0, 256, shape, dtype=torch.uint8, device="cuda")
     layout = HopperMXScaleLayout(x.shape, mx_axis=mx_axis, num_warps=num_warps)

@@ -73,7 +73,7 @@ def _upcast_mxfp4_to_bf16(Y, X, XScale, x_stride_m, x_stride_n, x_scale_stride_m
     tl.store(Y + offs_y, y)


-@pytest.mark.xfail(condition=not is_cuda(), reason="Only supported on cuda")
+@pytest.mark.xfail(condition=not is_cuda(), reason="Only supported on cuda", run=False)
 @pytest.mark.skipif(not is_cuda() and not is_xpu(), reason="Only supported on cuda")
 @pytest.mark.skipif(is_cuda() and not cuda_capability_geq(9), reason="Only supported for capability >= 9")
 def test_upcast_mxfp4_to_bf16():

scripts/test-triton.sh

Lines changed: 1 addition & 1 deletion
@@ -404,7 +404,7 @@ run_gluon_tests() {
   cd $TRITON_PROJ/python/test/gluon

   TRITON_TEST_SUITE=gluon \
-    run_pytest_command -vvv -n ${PYTEST_MAX_PROCESSES:-8} --device xpu . -m "not xfail"
+    run_pytest_command -vvv -n ${PYTEST_MAX_PROCESSES:-8} --device xpu .
 }

 run_interpreter_tests() {
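
Note on the script change above: with `run=False` added to the hardware-gated xfail markers in these Gluon suites, the blanket `-m "not xfail"` deselection becomes redundant. The xfail-marked tests are still collected and appear in the report (as xfailed when their condition holds, or run normally when it does not), so the suite no longer needs to filter them out up front.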
