
Commit 734b86e

[Intel] Enable Gluon tests
Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
Co-authored-by: Ilya Enkovich <ilya.enkovich@intel.com>
1 parent: bd6de9e

6 files changed: +58 −18 lines

.github/workflows/build-test-reusable.yml

Lines changed: 6 additions & 0 deletions
@@ -196,6 +196,7 @@ jobs:
         suite:
           - minicore
           - scaled_dot
+          - gluon
           - rest
           - tutorial-fa-64
           - tutorial-fa-128-fwdfp8
@@ -306,6 +307,11 @@ jobs:
         run: |
           ${{ env.TRITON_TEST_CMD }} --scaled-dot

+      - name: Run gluon tests
+        if: matrix.suite == 'gluon'
+        run: |
+          ${{ env.TRITON_TEST_CMD }} --gluon
+
       - name: Run interpreter tests
         if: matrix.suite == 'rest'
         run: |

python/test/gluon/test_core.py

Lines changed: 16 additions & 15 deletions
@@ -55,8 +55,8 @@ def copy_kernel(Out, In, numel, XBLOCK: ttgl.constexpr, layout: ttgl.constexpr):
     ttgl.BlockedLayout(size_per_thread=[8], threads_per_warp=[THREADS_PER_WARP], warps_per_cta=[8], order=[0]),
 ])
 @pytest.mark.parametrize("XBLOCK", [128, 256, 512, 1024, 2048])
-def test_copy_kernel(layout, XBLOCK):
-    inp = torch.randn(XBLOCK * 4 - 7, device="cuda")
+def test_copy_kernel(layout, XBLOCK, device):
+    inp = torch.randn(XBLOCK * 4 - 7, device=device)
     out = torch.empty_like(inp)

     copy_kernel[(4, )](out, inp, inp.numel(), XBLOCK, layout, num_warps=layout.warps_per_cta[0])
@@ -113,8 +113,8 @@ def async_copy_mbarrier_kernel(out, inp, xnumel, XBLOCK: ttgl.constexpr, YBLOCK:


 @pytest.mark.skipif(not is_ampere_or_newer(), reason="Requires Ampere")
-def test_async_copy_mbarrier():
-    tensor_opts = dict(dtype=torch.float, device="cuda")
+def test_async_copy_mbarrier(device):
+    tensor_opts = dict(dtype=torch.float, device=device)
     out = torch.empty((32, 32), **tensor_opts)
     inp = torch.randn((20, 32), **tensor_opts)
     async_copy_mbarrier_kernel[(1, )](out, inp, inp.shape[0], XBLOCK=32, YBLOCK=32)
@@ -470,7 +470,7 @@ def make_finite(x, dtype):
     torch.testing.assert_close(z, z_ref, rtol=1e-5, atol=1e-5)


-def test_math_fast_expf():
+def test_math_fast_expf(device):

     @gluon.jit
     def fast_expf_kernel(x_ptr, y_ptr, warp_size: ttgl.constexpr, num_warps: ttgl.constexpr):
@@ -484,13 +484,13 @@ def fast_expf_kernel(x_ptr, y_ptr, warp_size: ttgl.constexpr, num_warps: ttgl.co
     num_warps = 4

     torch.manual_seed(0)
-    x = torch.randn(THREADS_PER_WARP * num_warps, device="cuda", dtype=torch.float32)
+    x = torch.randn(THREADS_PER_WARP * num_warps, device=device, dtype=torch.float32)
     y = torch.empty_like(x)
     fast_expf_kernel[(1, )](x, y, THREADS_PER_WARP, num_warps)
     torch.testing.assert_close(y, torch.exp(x), atol=1e-5, rtol=1e-4)


-def test_math_fast_dividef():
+def test_math_fast_dividef(device):

     @gluon.jit
     def fast_dividef_kernel(x_ptr, y_ptr, z_ptr, warp_size: ttgl.constexpr, num_warps: ttgl.constexpr):
@@ -505,7 +505,7 @@ def fast_dividef_kernel(x_ptr, y_ptr, z_ptr, warp_size: ttgl.constexpr, num_warp
     num_warps = 4

     torch.manual_seed(0)
-    x = torch.randn(THREADS_PER_WARP * num_warps, device="cuda", dtype=torch.float32)
+    x = torch.randn(THREADS_PER_WARP * num_warps, device=device, dtype=torch.float32)
     y = torch.randn_like(x)
     z = torch.empty_like(x)
     y[y == 0] = 1.0
@@ -734,7 +734,7 @@ def kernel(a_ptr, b_ptr, c_ptr, d_ptr):
     torch.testing.assert_close(d_ref, d_tri, rtol=0.08, atol=0)


-def test_slice_reinterpret():
+def test_slice_reinterpret(device):
     BLOCK = ttgl.constexpr(2048)
     SPLIT_BLOCK = ttgl.constexpr(BLOCK // 2)
     XBLOCK = ttgl.constexpr(32)
@@ -759,7 +759,7 @@ def kernel(in_ptr, out_ptr):
         value = smem_slice1.load(blocked)
         ttgl.store(ttgl.set_auto_layout(out_ptr + offs, blocked), value)

-    input = torch.randint(0, 100, (XBLOCK, YBLOCK), dtype=torch.int32, device="cuda")
+    input = torch.randint(0, 100, (XBLOCK, YBLOCK), dtype=torch.int32, device=device)
     output = torch.empty_like(input)
     kernel[(1, )](input, output)
     torch.testing.assert_close(input, output, atol=0, rtol=0)
@@ -856,7 +856,7 @@ def early_return_kernel(x):
     return x


-def test_2d_tensor_early_return():
+def test_2d_tensor_early_return(device):
     warp_size = ttgl.constexpr(THREADS_PER_WARP)

     @gluon.jit
@@ -871,7 +871,7 @@ def kernel(N, out):
         x += early_return_kernel(x)
         ttgl.store(out, x.sum(0).sum(0))

-    out = torch.empty(1, dtype=torch.int32, device="cuda")
+    out = torch.empty(1, dtype=torch.int32, device=device)
     compiled_kernel = kernel.warmup(N=100, out=out, grid=(1, ))
     assert compiled_kernel.asm["llir"].count("define") == 1

@@ -906,7 +906,8 @@ def kernel(x, y):
     {"offsets": [[0, 1], [0, 2], [0, 8], [0, 4], [0, 16], [0, 32], [2, 0], [1, 0], [4, 0], [8, 0], [16, 0], [32, 0]]}])
 @pytest.mark.parametrize("slice_m_offset, slice_n_offset, slice_m, slice_n", [(48, 16, 16, 16), (32, 48, 32, 16),
                                                                               (48, 32, 16, 32)])
-def test_padded_shared_layout_subslice(interval_pairs, shared_layout, slice_m_offset, slice_n_offset, slice_m, slice_n):
+def test_padded_shared_layout_subslice(interval_pairs, shared_layout, slice_m_offset, slice_n_offset, slice_m, slice_n,
+                                       device):
     m = 64
     n = 64
     num_warps = 1
@@ -945,8 +946,8 @@ def kernel(in_ptr, out_ptr, M: ttgl.constexpr, N: ttgl.constexpr, SLICE_M_OFFSET
         out_offs = offs_m_store[:, None] * SLICE_N + offs_n_store[None, :]
         ttgl.store(out_ptr + out_offs, out_data)

-    input = torch.arange(m * n, device="cuda").reshape(m, n).to(torch.int32)
-    output = torch.zeros((slice_m, slice_n), dtype=torch.int32, device="cuda")
+    input = torch.arange(m * n, device=device).reshape(m, n).to(torch.int32)
+    output = torch.zeros((slice_m, slice_n), dtype=torch.int32, device=device)
     ref_output = input[slice_m_offset:slice_m_offset + slice_m, slice_n_offset:slice_n_offset + slice_n]

     kernel[(1, )](input, output, m, n, slice_m_offset, slice_n_offset, slice_m, slice_n, num_warps=num_warps)
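Note: the changes above swap the hard-coded device="cuda" arguments for a device test parameter so the same tests can target XPU. The commit does not show where that parameter is defined; as a minimal sketch only (assuming a conftest.py-style pytest fixture fed by the --device flag that run_pytest_command passes, neither of which is part of this diff):

    # Hypothetical conftest.py sketch; the real fixture lives in Triton's shared
    # test configuration and may be implemented differently.
    import pytest

    def pytest_addoption(parser):
        # Accept a --device option such as "--device xpu".
        parser.addoption("--device", action="store", default="cuda")

    @pytest.fixture
    def device(request):
        # Hand each test that declares a `device` argument the selected device
        # string, e.g. torch.randn(..., device=device).
        return request.config.getoption("--device")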

python/test/gluon/test_lowerings.py

Lines changed: 8 additions & 3 deletions
@@ -4,7 +4,7 @@
 import triton
 from triton.experimental import gluon
 from triton.experimental.gluon import language as ttgl
-from triton._internal_testing import is_cuda, is_hip, is_hopper_or_newer, get_hip_lds_size
+from triton._internal_testing import is_xpu, is_cuda, is_hip, is_hopper_or_newer, get_hip_lds_size


 def _is_layout_applicable(layout) -> bool:
@@ -152,6 +152,8 @@ def _reduce_layouts():
     for (M, N) in shapes:
         for layout in layouts:
             if isinstance(layout, (ttgl.amd.AMDMFMALayout, ttgl.NVMMADistributedLayout)):
+                if is_xpu():
+                    continue
                 instr_shape = layout.instr_shape
                 if M < instr_shape[0] or N < instr_shape[1]:
                     continue
@@ -587,6 +589,8 @@ def kernel(x_ptr, y_ptr, M: ttgl.constexpr, N: ttgl.constexpr, src_layout: ttgl.
                          [pair for pair in _mma_pairs if all(_is_layout_applicable(layout) for layout in pair)])
 def test_convert_mma2mma_layouts(M, N, mma_pair, dtype, device):
     src_layout, dst_layout = mma_pair
+    if is_xpu() and isinstance(src_layout, (ttgl.amd.AMDMFMALayout, ttgl.NVMMADistributedLayout)):
+        pytest.xfail("AMD and NVIDIA MMA layouts are not supported on Intel GPUs")

     @gluon.jit
     def kernel(x_ptr, y_ptr, M: ttgl.constexpr, N: ttgl.constexpr, src_layout: ttgl.constexpr,
@@ -790,7 +794,7 @@ def kernel(x_ptr, y_ptr, shape_tuple: ttgl.constexpr, src_layout: ttgl.constexpr
     y = torch.zeros_like(x)
     obj = kernel[(1, )](x, y, shape, dist_layout, blocked_layout, shared_layout, num_warps=num_warps)
     torch.testing.assert_close(y, x)
-    if (isinstance(shared_layout, ttgl.NVMMASharedLayout) and dist_layout in _ld_st_mma_layouts
+    if (is_cuda() and isinstance(shared_layout, ttgl.NVMMASharedLayout) and dist_layout in _ld_st_mma_layouts
             and dist_layout.version[0] >= 3 and dtype == "float16"):
         assert "stmatrix" in obj.asm["ptx"]

@@ -1220,7 +1224,8 @@ def test_gather_layouts(axis, src_layout, index_layout, src_shape, idx_shape, de
         raise RuntimeError(f"Unsupported shape: {src_shape}")

     torch.testing.assert_close(out, ref, rtol=0, atol=0)
-    assert ("nvvm.shfl.sync.idx" in obj.asm["llir"]) or ("llvm.amdgcn.ds.bpermute" in obj.asm["llir"])
+    if is_cuda():
+        assert ("nvvm.shfl.sync.idx" in obj.asm["llir"]) or ("llvm.amdgcn.ds.bpermute" in obj.asm["llir"])


 @pytest.mark.parametrize("M, N, M_tile_size, N_tile_size",
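The new guards rely on is_xpu() from triton._internal_testing to skip MMA-layout cases and NVIDIA/AMD-specific assembly assertions when running on Intel GPUs. The helper's implementation is not part of this diff; a rough sketch of such a backend predicate (an assumption, not the actual code) might look like:

    # Hypothetical sketch of a backend check in the spirit of is_cuda()/is_hip();
    # the real is_xpu() may be implemented differently.
    import triton

    def is_xpu() -> bool:
        # True when the active Triton driver reports an Intel XPU target.
        target = triton.runtime.driver.active.get_current_target()
        return target is not None and target.backend == "xpu"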

scripts/skiplist/default/gluon.txt

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+python/test/gluon/test_core.py::test_2d_tensor_early_return
+python/test/gluon/test_lowerings.py::test_histogram[2048-2-src_layout3-dst_layout3]
+python/test/gluon/test_lowerings.py::test_histogram[32-32-src_layout4-dst_layout4]
+python/test/gluon/test_lowerings.py::test_scan_layouts[r"True-.*"]@regexp
+python/test/gluon/test_lowerings.py::test_reduce_layouts[r".*-True-.*"]@regexp
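The last two entries end in @regexp, which suggests the bracketed part is matched as a regular expression against parametrization ids rather than as one literal id. A purely illustrative example of that kind of matching (a hypothetical helper, not the loader actually used by the test scripts):

    import re

    # Pattern taken from the test_scan_layouts entry above; the ids are invented examples.
    pattern = re.compile(r"True-.*")
    for param_id in ["True-src_layout0", "False-src_layout0"]:
        print(param_id, bool(pattern.fullmatch(param_id)))
    # -> True-src_layout0 True
    #    False-src_layout0 False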

scripts/test-triton.sh

Lines changed: 20 additions & 0 deletions
@@ -14,6 +14,7 @@ TEST:
 --minicore part of core
 --mxfp part of core
 --scaled-dot part of core
+--gluon
 --interpreter
 --benchmarks
 --softmax
@@ -55,6 +56,7 @@ TEST_CORE=false
 TEST_MINICORE=false
 TEST_MXFP=false
 TEST_SCALED_DOT=false
+TEST_GLUON=false
 TEST_INTERPRETER=false
 TEST_TUTORIAL=false
 TEST_MICRO_BENCHMARKS=false
@@ -106,6 +108,11 @@ while (( $# != 0 )); do
       TEST_DEFAULT=false
       shift
       ;;
+    --gluon)
+      TEST_GLUON=true
+      TEST_DEFAULT=false
+      shift
+      ;;
     --interpreter)
       TEST_INTERPRETER=true
       TEST_DEFAULT=false
@@ -390,6 +397,16 @@ run_core_tests() {
   run_scaled_dot_tests
 }

+run_gluon_tests() {
+  echo "***************************************************"
+  echo "****** Running Gluon tests ******"
+  echo "***************************************************"
+  cd $TRITON_PROJ/python/test/gluon
+
+  TRITON_TEST_SUITE=gluon \
+    run_pytest_command -vvv -n ${PYTEST_MAX_PROCESSES:-8} --device xpu test_lowerings.py
+}
+
 run_interpreter_tests() {
   echo "***************************************************"
   echo "****** Running Triton Interpreter tests ******"
@@ -605,6 +622,9 @@ test_triton() {
     fi
   fi

+  if [ "$TEST_GLUON" == true ]; then
+    run_gluon_tests
+  fi
   if [ "$TEST_INTERPRETER" = true ]; then
     run_interpreter_tests
   fi

third_party/intel/backend/compiler.py

Lines changed: 3 additions & 0 deletions
@@ -122,6 +122,9 @@ def __init__(self, target: tuple) -> None:
         self.properties = self.parse_target(target.arch)
         self.binary_ext = "spv"

+    def get_target_name(self, options) -> str:
+        return f"xpu:{self.device_arch}"
+
     def parse_target(self, tgt_prop) -> dict:
         dev_prop = {}
         dev_prop['name'] = tgt_prop.get('name', 'xpu')
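For reference, the new get_target_name hook simply prefixes the parsed device architecture with "xpu:". A standalone illustration of that behavior (the class name and the "pvc" architecture string below are made up for the example, not taken from this commit):

    # Minimal stand-in mirroring the added method; not the real Intel backend class.
    class _FakeIntelBackend:
        def __init__(self, device_arch: str):
            self.device_arch = device_arch

        def get_target_name(self, options=None) -> str:
            # Same formatting as the diff: "xpu:" followed by the architecture name.
            return f"xpu:{self.device_arch}"

    print(_FakeIntelBackend("pvc").get_target_name())  # prints "xpu:pvc"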
