@@ -55,8 +55,8 @@ def copy_kernel(Out, In, numel, XBLOCK: ttgl.constexpr, layout: ttgl.constexpr):
     ttgl.BlockedLayout(size_per_thread=[8], threads_per_warp=[THREADS_PER_WARP], warps_per_cta=[8], order=[0]),
 ])
 @pytest.mark.parametrize("XBLOCK", [128, 256, 512, 1024, 2048])
-def test_copy_kernel(layout, XBLOCK):
-    inp = torch.randn(XBLOCK * 4 - 7, device="cuda")
+def test_copy_kernel(layout, XBLOCK, device):
+    inp = torch.randn(XBLOCK * 4 - 7, device=device)
     out = torch.empty_like(inp)
 
     copy_kernel[(4, )](out, inp, inp.numel(), XBLOCK, layout, num_warps=layout.warps_per_cta[0])
@@ -113,8 +113,8 @@ def async_copy_mbarrier_kernel(out, inp, xnumel, XBLOCK: ttgl.constexpr, YBLOCK:
 
 
 @pytest.mark.skipif(not is_ampere_or_newer(), reason="Requires Ampere")
-def test_async_copy_mbarrier():
-    tensor_opts = dict(dtype=torch.float, device="cuda")
+def test_async_copy_mbarrier(device):
+    tensor_opts = dict(dtype=torch.float, device=device)
     out = torch.empty((32, 32), **tensor_opts)
     inp = torch.randn((20, 32), **tensor_opts)
     async_copy_mbarrier_kernel[(1, )](out, inp, inp.shape[0], XBLOCK=32, YBLOCK=32)
@@ -470,7 +470,7 @@ def make_finite(x, dtype):
     torch.testing.assert_close(z, z_ref, rtol=1e-5, atol=1e-5)
 
 
-def test_math_fast_expf():
+def test_math_fast_expf(device):
 
     @gluon.jit
     def fast_expf_kernel(x_ptr, y_ptr, warp_size: ttgl.constexpr, num_warps: ttgl.constexpr):
@@ -484,13 +484,13 @@ def fast_expf_kernel(x_ptr, y_ptr, warp_size: ttgl.constexpr, num_warps: ttgl.co
     num_warps = 4
 
     torch.manual_seed(0)
-    x = torch.randn(THREADS_PER_WARP * num_warps, device="cuda", dtype=torch.float32)
+    x = torch.randn(THREADS_PER_WARP * num_warps, device=device, dtype=torch.float32)
     y = torch.empty_like(x)
     fast_expf_kernel[(1, )](x, y, THREADS_PER_WARP, num_warps)
     torch.testing.assert_close(y, torch.exp(x), atol=1e-5, rtol=1e-4)
 
 
-def test_math_fast_dividef():
+def test_math_fast_dividef(device):
 
     @gluon.jit
     def fast_dividef_kernel(x_ptr, y_ptr, z_ptr, warp_size: ttgl.constexpr, num_warps: ttgl.constexpr):
@@ -505,7 +505,7 @@ def fast_dividef_kernel(x_ptr, y_ptr, z_ptr, warp_size: ttgl.constexpr, num_warp
     num_warps = 4
 
     torch.manual_seed(0)
-    x = torch.randn(THREADS_PER_WARP * num_warps, device="cuda", dtype=torch.float32)
+    x = torch.randn(THREADS_PER_WARP * num_warps, device=device, dtype=torch.float32)
     y = torch.randn_like(x)
     z = torch.empty_like(x)
     y[y == 0] = 1.0
@@ -734,7 +734,7 @@ def kernel(a_ptr, b_ptr, c_ptr, d_ptr):
     torch.testing.assert_close(d_ref, d_tri, rtol=0.08, atol=0)
 
 
-def test_slice_reinterpret():
+def test_slice_reinterpret(device):
     BLOCK = ttgl.constexpr(2048)
     SPLIT_BLOCK = ttgl.constexpr(BLOCK // 2)
     XBLOCK = ttgl.constexpr(32)
@@ -759,7 +759,7 @@ def kernel(in_ptr, out_ptr):
         value = smem_slice1.load(blocked)
         ttgl.store(ttgl.set_auto_layout(out_ptr + offs, blocked), value)
 
-    input = torch.randint(0, 100, (XBLOCK, YBLOCK), dtype=torch.int32, device="cuda")
+    input = torch.randint(0, 100, (XBLOCK, YBLOCK), dtype=torch.int32, device=device)
     output = torch.empty_like(input)
     kernel[(1, )](input, output)
     torch.testing.assert_close(input, output, atol=0, rtol=0)
@@ -856,7 +856,7 @@ def early_return_kernel(x):
     return x
 
 
-def test_2d_tensor_early_return():
+def test_2d_tensor_early_return(device):
     warp_size = ttgl.constexpr(THREADS_PER_WARP)
 
     @gluon.jit
@@ -871,7 +871,7 @@ def kernel(N, out):
         x += early_return_kernel(x)
         ttgl.store(out, x.sum(0).sum(0))
 
-    out = torch.empty(1, dtype=torch.int32, device="cuda")
+    out = torch.empty(1, dtype=torch.int32, device=device)
     compiled_kernel = kernel.warmup(N=100, out=out, grid=(1, ))
     assert compiled_kernel.asm["llir"].count("define") == 1
 
@@ -906,7 +906,8 @@ def kernel(x, y):
                                            {"offsets": [[0, 1], [0, 2], [0, 8], [0, 4], [0, 16], [0, 32], [2, 0], [1, 0], [4, 0], [8, 0], [16, 0], [32, 0]]}])
 @pytest.mark.parametrize("slice_m_offset, slice_n_offset, slice_m, slice_n", [(48, 16, 16, 16), (32, 48, 32, 16),
                                                                               (48, 32, 16, 32)])
-def test_padded_shared_layout_subslice(interval_pairs, shared_layout, slice_m_offset, slice_n_offset, slice_m, slice_n):
+def test_padded_shared_layout_subslice(interval_pairs, shared_layout, slice_m_offset, slice_n_offset, slice_m, slice_n,
+                                       device):
     m = 64
     n = 64
     num_warps = 1
@@ -945,8 +946,8 @@ def kernel(in_ptr, out_ptr, M: ttgl.constexpr, N: ttgl.constexpr, SLICE_M_OFFSET
         out_offs = offs_m_store[:, None] * SLICE_N + offs_n_store[None, :]
         ttgl.store(out_ptr + out_offs, out_data)
 
-    input = torch.arange(m * n, device="cuda").reshape(m, n).to(torch.int32)
-    output = torch.zeros((slice_m, slice_n), dtype=torch.int32, device="cuda")
+    input = torch.arange(m * n, device=device).reshape(m, n).to(torch.int32)
+    output = torch.zeros((slice_m, slice_n), dtype=torch.int32, device=device)
     ref_output = input[slice_m_offset:slice_m_offset + slice_m, slice_n_offset:slice_n_offset + slice_n]
 
     kernel[(1, )](input, output, m, n, slice_m_offset, slice_n_offset, slice_m, slice_n, num_warps=num_warps)
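For context: every test touched by this diff now takes device as a pytest fixture instead of hardcoding "cuda" when allocating tensors. The fixture itself is not part of this diff; below is a minimal sketch of what such a fixture could look like in conftest.py (hypothetical; the actual fixture in the repository may be driven by a command-line option or support additional backends).

# conftest.py (hypothetical sketch, not part of this diff)
import pytest
import torch


@pytest.fixture
def device():
    # Assume CUDA when available so the Gluon tests keep running on GPU;
    # fall back to CPU otherwise.
    return "cuda" if torch.cuda.is_available() else "cpu"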