@@ -74,7 +74,7 @@ def tma_kernel(desc):
     alloc._keep_alive()


-@pytest.mark.xfail(not is_hopper_or_newer(), reason="Requires Hopper")
+@pytest.mark.xfail(not is_hopper_or_newer(), reason="Requires Hopper", run=False)
 def test_tma():
     out = torch.ones((16, 16), dtype=torch.float16, device="cuda")
     layout = ttgl.NVMMASharedLayout(
@@ -113,7 +113,7 @@ def async_copy_mbarrier_kernel(out, inp, xnumel, XBLOCK: ttgl.constexpr, YBLOCK:
     ttgl.store(out + xindex * YBLOCK + yindex, val)


-@pytest.mark.xfail(not is_ampere_or_newer(), reason="Requires Ampere")
+@pytest.mark.xfail(not is_ampere_or_newer(), reason="Requires Ampere", run=False)
 def test_async_copy_mbarrier(device):
     tensor_opts = dict(dtype=torch.float, device=device)
     out = torch.empty((32, 32), **tensor_opts)
@@ -154,7 +154,7 @@ def warpgroup_mma_kernel(a, b, out, M: ttgl.constexpr, N: ttgl.constexpr, K: ttg
     ttgl.store(out + out_offs_m * N + out_offs_n, acc)


-@pytest.mark.xfail(not is_hopper(), reason="Requires Hopper")
+@pytest.mark.xfail(not is_hopper(), reason="Requires Hopper", run=False)
 @pytest.mark.parametrize("ASYNC", [True, False])
 def test_warpgroup_mma(ASYNC):
     torch.manual_seed(0)
@@ -169,7 +169,7 @@ def test_warpgroup_mma(ASYNC):
     torch.testing.assert_close(out, ref, atol=1e-3, rtol=1e-1)


-@pytest.mark.xfail(not is_hip_cdna4(), reason="Requires CDNA4")
+@pytest.mark.xfail(not is_hip_cdna4(), reason="Requires CDNA4", run=False)
 @pytest.mark.parametrize("use_buffer_load", [True, False])
 def test_amd_direct_load_to_shared(use_buffer_load):

@@ -205,7 +205,7 @@ def kernel(a_ptr, b_ptr, use_buffer_load: ttgl.constexpr):
     assert 'vmcnt(0)' in pgm.asm['amdgcn']


-@pytest.mark.xfail(not (is_hip_gfx11() or is_hip_gfx12()), reason="Requires RDNA3 or RDNA4")
+@pytest.mark.xfail(not (is_hip_gfx11() or is_hip_gfx12()), reason="Requires RDNA3 or RDNA4", run=False)
 @pytest.mark.parametrize("M, N, K", [(64, 64, 64)])
 @pytest.mark.parametrize("in_dtype", ['float16', 'bfloat16'])
 def test_amd_wmma(M, N, K, in_dtype):
@@ -331,7 +331,7 @@ def kernel(a_ptr, b_ptr, c_ptr, stride_am, stride_ak, #
     torch.testing.assert_close(ref, triton_output)


-@pytest.mark.xfail(not is_hip_cdna4(), reason="Requires CDNA4")
+@pytest.mark.xfail(not is_hip_cdna4(), reason="Requires CDNA4", run=False)
 @pytest.mark.parametrize("M, N, K, rhs_scale, mxfp_type, normal_type", [(32, 32, 128, rhs_scale, mxfp_type, normal_type)
                                                                         for rhs_scale in [True, False]
                                                                         for mxfp_type in ["e2m1"]
@@ -517,7 +517,7 @@ def fast_dividef_kernel(x_ptr, y_ptr, z_ptr, warp_size: ttgl.constexpr, num_warp


 @pytest.mark.xfail(reason="copy to tmem with scale layout is currently broken in Gluon.")
-@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell")
+@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell", run=False)
 def test_tmem_copy_2d():
     device = "cuda"

@@ -566,7 +566,7 @@ def kernel(in_ptr, out_ptr, smem_h: ttgl.constexpr, smem_w: ttgl.constexpr, num_
         assert torch.equal(x[m * 32:(m + 1) * 32], z_tri[32 * i:32 * (i + 1), col_offset:(col_offset + 4)])


-@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell")
+@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell", run=False)
 def test_tmem_subslice_block_m_64():

     @gluon.jit
@@ -646,7 +646,7 @@ def kernel(s_ptr, out_ptr):
     torch.testing.assert_close(out_ref, out_tri, atol=0, rtol=0)


-@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell")
+@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell", run=False)
 def test_block_m_64_mma():

     @gluon.jit
@@ -768,7 +768,7 @@ def kernel(in_ptr, out_ptr):
     torch.testing.assert_close(input, output, atol=0, rtol=0)


-@pytest.mark.xfail(not is_hopper_or_newer(), reason="Requires Hopper")
+@pytest.mark.xfail(not is_hopper_or_newer(), reason="Requires Hopper", run=False)
 def test_tma_slice():
     XBLOCK = YBLOCK = ttgl.constexpr(128)

@@ -805,7 +805,7 @@ def kernel(in_desc, out_desc):
 @pytest.mark.parametrize("swizzle", [32, 64, 128])
 @pytest.mark.parametrize("num_warps", [4, 8])
 @pytest.mark.parametrize("M, N, BLOCK_N", [(128, 128, 128), (256, 128, 64), (128, 128, 16)])
-@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell")
+@pytest.mark.xfail(not is_blackwell(), reason="Requires Blackwell", run=False)
 def test_tmem_copy_no_scales(M, N, BLOCK_N, num_warps, swizzle):

     @gluon.jit
@@ -879,7 +879,7 @@ def kernel(N, out):
     assert compiled_kernel.asm["llir"].count("define") == 1


-@pytest.mark.xfail(not is_hip_cdna3() and not is_hip_cdna4(), reason="Requires CDNA3 or CDNA4")
+@pytest.mark.xfail(not is_hip_cdna3() and not is_hip_cdna4(), reason="Requires CDNA3 or CDNA4", run=False)
 def test_inline_with_amdgpu_dialect():

     @gluon.jit
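For reference, a minimal sketch of the marker pattern this diff applies throughout: passing run=False to pytest.mark.xfail makes pytest report the test as an expected failure without ever calling it, so a kernel that needs newer hardware is never launched on a GPU that lacks it. The is_hopper_or_newer helper below is a hypothetical stand-in for the capability checks used in the test suite, not the suite's actual implementation.

import pytest
import torch

def is_hopper_or_newer():
    # Hypothetical stand-in for the test suite's capability helper:
    # Hopper corresponds to CUDA compute capability 9.x.
    return torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 9

# With run=False, pytest records the test as xfail without executing it,
# so the body never runs on hardware that cannot support it.
@pytest.mark.xfail(not is_hopper_or_newer(), reason="Requires Hopper", run=False)
def test_requires_hopper():
    assert True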