Commit 5b376c1

Revert "[Runtime] Rework constexpr_function to support cache invalidation (#7…"
This reverts commit a977e39.
1 parent abb66fe commit 5b376c1

21 files changed, +243 -310 lines changed

python/test/unit/language/test_frontend.py

Lines changed: 6 additions & 6 deletions
@@ -308,7 +308,7 @@ def test_aggregate_with_constexpr():
     # CHECK: arith.addi %arg0, %cst : tensor<4xi32>


-@triton.constexpr_function
+@tl.constexpr_function
 def constexpr_function(x):
     return x + 1

@@ -345,12 +345,12 @@ def test_reassign_aggregate_with_constexpr():
     agg = agg.modify(tl.arange(4, 8))


-@triton.constexpr_function
+@tl.constexpr_function
 def make_shape(m, n):
     return (m, n)


-@triton.constexpr_function
+@tl.constexpr_function
 def add_shape_dims(m, n):
     return m + n

@@ -365,7 +365,7 @@ def test_constexpr_getitem():
     tl.arange(4, sum)


-@triton.constexpr_function
+@tl.constexpr_function
 def make_constexpr_closure(x):
     x = tl.constexpr(x)

@@ -386,7 +386,7 @@ def test_constexpr_closure():
     closure((128, 128))


-@triton.constexpr_function
+@tl.constexpr_function
 def make_constexpr_generator(f):
     f = tl.constexpr(f)

@@ -422,7 +422,7 @@ def test_constexpr_generator():
     generator(lhs)


-@triton.constexpr_function
+@tl.constexpr_function
 def Box(T):

     @tl.core._aggregate

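The hunks above switch these tests from @triton.constexpr_function back to @tl.constexpr_function: after the revert, the decorator is exposed on triton.language rather than on the top-level triton module. A minimal sketch of the resulting usage pattern, mirroring the helpers in the diff (the kernel name and output buffer are illustrative, not part of the change):

import triton
import triton.language as tl


@tl.constexpr_function
def add_shape_dims(m, n):
    # Evaluated at compile time; the result behaves like a tl.constexpr.
    return m + n


@triton.jit
def example_kernel(out):
    # Illustrative kernel: the helper's result is a compile-time constant,
    # so it can size tl.arange (out is assumed to be an int32 buffer).
    total: tl.constexpr = add_shape_dims(4, 4)
    offs = tl.arange(0, total)
    tl.store(out + offs, offs)
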
python/test/unit/language/test_tuple.py

Lines changed: 8 additions & 2 deletions
@@ -217,17 +217,23 @@ def m_to_the_n(X, shape: tl.constexpr, strides, m_n):

 def test_passing_tuple_to_make_tensor_descriptor(device, with_allocator):

+    from triton.language.core import builtin
+
+    @builtin
+    def is_constexpr(v, _semantic=None):
+        return isinstance(v, tl.constexpr)
+
     @triton.jit
     def m_to_the_n(X_base, shape, strides, m_n, BLOCK_DIM: tl.constexpr):
-        tl.static_assert(isinstance(strides[1].type, tl.constexpr_type))
+        tl.static_assert(is_constexpr(strides[1]))
         X = tl.make_tensor_descriptor(
             X_base,
             shape=shape,
             strides=strides,
             block_shape=[BLOCK_DIM, BLOCK_DIM],
         )
         # Make sure tl.make_tensor_descriptor didn't modify strides (i.e. didn't unwrap the constexpr)
-        tl.static_assert(isinstance(strides[1].type, tl.constexpr_type))
+        tl.static_assert(is_constexpr(strides[1]))
         data = X.load([0, 0])
         # Include a for loop to ensure strides[1] is lifted into a constexpr
         # (otherwise cloning the local scope will fail).

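The test now installs a small @builtin helper so the static assertion can check the constexpr wrapper type directly. Outside a @triton.jit kernel, tl.constexpr is an ordinary wrapper object, so the same predicate can be sanity-checked standalone (a minimal sketch; the values are illustrative):

import triton.language as tl


def is_constexpr(v):
    # Same predicate the test registers as a builtin, usable in plain Python
    # because tl.constexpr is just a wrapper class around a Python value.
    return isinstance(v, tl.constexpr)


assert is_constexpr(tl.constexpr(7))
assert not is_constexpr(7)
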
python/test/unit/runtime/test_cache.py

Lines changed: 0 additions & 36 deletions
@@ -127,42 +127,6 @@ def test_combine_fn_change():
     seen_keys.add(key)


-@triton.constexpr_function
-def constexpr_flag_fn():
-    return False
-
-
-@triton.jit
-def constexpr_fn_user(out):
-    a: tl.constexpr = constexpr_flag_fn()
-    tl.store(out, a)
-
-
-def test_constexpr_fn_change():
-    baseline = constexpr_fn_user.cache_key
-
-    orig_src = constexpr_flag_fn.src
-    new_src = orig_src.replace("False", "True")
-    constexpr_flag_fn._unsafe_update_src(new_src)
-    constexpr_fn_user.hash = None
-    updated = constexpr_fn_user.cache_key
-    assert baseline != updated
-
-    constexpr_flag_fn._unsafe_update_src(orig_src)
-    constexpr_fn_user.hash = None
-    assert constexpr_fn_user.cache_key == baseline
-
-
-@triton.constexpr_function
-def invalid_constexpr_fn():
-    return torch.cuda.get_device_capability()
-
-
-def test_invalid_constexpr_fn():
-    with pytest.raises(RuntimeError):
-        invalid_constexpr_fn.cache_key
-
-
 def write_and_load_module(temp_file: pathlib.Path, code, num_extra_lines):
     temp_file.write_text(('# extra line\n' * num_extra_lines) + code)
     spec = importlib.util.spec_from_file_location("module.name", str(temp_file))

python/triton/__init__.py

Lines changed: 1 addition & 2 deletions
@@ -17,7 +17,7 @@
     InterpreterError,
     MockTensor,
 )
-from .runtime.jit import constexpr_function, jit
+from .runtime.jit import jit
 from .runtime._async_compile import AsyncCompileMode, FutureKernel
 from .compiler import compile, CompilationError
 from .errors import TritonError
@@ -36,7 +36,6 @@
     "CompilationError",
     "compile",
     "Config",
-    "constexpr_function",
     "FutureKernel",
     "heuristics",
     "InterpreterError",

python/triton/compiler/code_generator.py

Lines changed: 5 additions & 5 deletions
@@ -14,8 +14,9 @@
 from .._C.libtriton import ir, gluon_ir
 from ..language import constexpr, str_to_ty, tensor, tuple as tl_tuple
 from ..language.core import _unwrap_if_constexpr, base_value, base_type
+from ..runtime.jit import get_jit_fn_file_line, get_full_name
 # ideally we wouldn't need any runtime component
-from ..runtime.jit import get_jit_fn_file_line, get_full_name, JITCallable, ConstexprFunction, JITFunction
+from ..runtime import JITFunction
 from .._utils import find_paths_if, get_iterable_path, set_iterable_path

 from .errors import (CompilationError, CompileTimeAssertionFailure, UnsupportedLanguageConstruct)
@@ -51,7 +52,7 @@ def _is_triton_tensor(o: Any) -> bool:


 def _is_constexpr(o: Any) -> bool:
-    return o is None or isinstance(o, (constexpr, language.core.dtype, JITCallable))
+    return o is None or isinstance(o, (constexpr, language.core.dtype, JITFunction))


 def _is_non_scalar_tensor(o: Any) -> bool:
@@ -395,7 +396,7 @@ def global_lookup(name: str, absent):
            val is absent,
            name in self.builtin_namespace, #
            type(val) is ModuleType, #
-           isinstance(val, JITCallable), #
+           isinstance(val, JITFunction), #
            getattr(val, "__triton_builtin__", False), #
            getattr(val, "__triton_aggregate__", False), #
            getattr(val, "__module__", "").startswith("triton.language"), #
@@ -1322,8 +1323,7 @@ def call_Function(self, node, fn, args, kws):
         if isinstance(fn, JITFunction):
             _check_fn_args(node, fn, args)
             return self.call_JitFunction(fn, args, kws)
-        if (hasattr(fn, '__self__') and _is_triton_value(fn.__self__)) or language.core.is_builtin(fn) or isinstance(
-                fn, ConstexprFunction):
+        if (hasattr(fn, '__self__') and _is_triton_value(fn.__self__)) or language.core.is_builtin(fn):
             extra_kwargs = dict()
             sig = inspect.signature(fn)
             if '_semantic' in sig.parameters:

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 from . import nvidia
-from ._runtime import constexpr_function, jit
+from ._runtime import jit
 from triton.language.core import must_use_result

-__all__ = ["constexpr_function", "jit", "must_use_result", "nvidia"]
+__all__ = ["jit", "must_use_result", "nvidia"]

python/triton/experimental/gluon/_runtime.py

Lines changed: 1 addition & 3 deletions
@@ -1,14 +1,12 @@
 from __future__ import annotations
 from triton.compiler.compiler import ASTSource
 from triton.backends.compiler import Language
-from triton.runtime.jit import JITFunction, constexpr_function
+from triton.runtime.jit import JITFunction
 from typing import TypeVar, Optional, Callable, Iterable, Union
 from triton._C.libtriton import ir

 T = TypeVar("T")

-__all__ = ["constexpr_function", "jit"]
-

 class GluonASTSource(ASTSource):


python/triton/experimental/gluon/language/_core.py

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,7 @@
 import triton.language.core as tl_core
 from triton.language.core import (
     constexpr,
+    constexpr_function,
     base_value,
     base_type,
     dtype,
@@ -78,6 +79,7 @@

 __all__ = [
     "constexpr",
+    "constexpr_function",
     "base_value",
     "base_type",
     "dtype",

python/triton/experimental/gluon/language/_layouts.py

Lines changed: 0 additions & 3 deletions
@@ -1,7 +1,6 @@
 from dataclasses import dataclass
 from typing import List, Optional
 from triton.language.core import _unwrap_if_constexpr, _unwrap_shape, constexpr_type
-from triton.runtime.jit import constexpr_function

 __all__ = [
     "AutoLayout",
@@ -261,7 +260,6 @@ def type(self):
         return constexpr_type(self)


-@constexpr_function
 def _get_shape_per_cta(shape, cta_split_num):
     shape_per_cta = shape
     if cta_split_num is not None:
@@ -325,7 +323,6 @@ def _to_ir(self, builder):
         )

     @staticmethod
-    @constexpr_function
     def get_default_for(block_shape, dtype, transposed=False, fp4_padded=False, ctas_per_cga=None, cta_split_num=None,
                         cta_order=None):
         """Returns an NVMMASharedLayout with default swizzling for a given shape.

python/triton/experimental/gluon/language/nvidia/blackwell/__init__.py

Lines changed: 6 additions & 11 deletions
@@ -2,9 +2,9 @@
 from typing import Optional, Tuple, List, TYPE_CHECKING

 from dataclasses import dataclass
-from triton.runtime.jit import constexpr_function
+import triton
 from triton.experimental.gluon.language import _core as ttgl
-from triton.experimental.gluon.language._core import builtin, base_type, base_value, _unwrap_if_constexpr
+from triton.experimental.gluon.language._core import builtin, base_type, base_value, _unwrap_if_constexpr, constexpr_function
 from triton.experimental.gluon.language._layouts import BlockedLayout, _get_shape_per_cta
 from triton.experimental.gluon.language._semantic import _check

@@ -62,11 +62,6 @@ def mangle(self) -> str:
         return f"TL{block_str}{unpacked_str}{cta_split_str}TL"


-@constexpr_function
-def _cdiv(x, div):
-    return (x + div - 1) // div
-
-
 @constexpr_function
 def get_tmem_32x32b_reg_layout(M, N, shape, num_warps, ctas_per_cga=None, cta_split_num=None, cta_order=None):
     """Returns a BlockedLayout compatible with load/store on tensor memory with the 32x32b instruction variant.
@@ -82,19 +77,19 @@ def get_tmem_32x32b_reg_layout(M, N, shape, num_warps, ctas_per_cga=None, cta_sp
     if M == 64:
         threads_per_warp = [16, 2]
         if num_blocks == 1:
-            size_per_thread = [1, _cdiv(N, num_warp_groups * 2)]
+            size_per_thread = [1, triton.cdiv(N, num_warp_groups * 2)]
             warps_per_cta = [4, num_warp_groups]
         else:
-            size_per_thread = [1, _cdiv(N, 2)]
+            size_per_thread = [1, triton.cdiv(N, 2)]
             warps_per_cta = [4 * min(blocks_per_tile[0], num_warp_groups)]
-            warps_per_cta.append(_cdiv(num_warp_groups, warps_per_cta[0] // 4))
+            warps_per_cta.append(triton.cdiv(num_warp_groups, warps_per_cta[0] // 4))
     else:
         if shape[0] > 128:
             size_per_thread = [1, N]
             threads_per_warp = [32, 1]
             warps_per_cta = [4 * num_warp_groups, 1]
         else:
-            size_per_thread = [1, _cdiv(N, num_warp_groups)]
+            size_per_thread = [1, triton.cdiv(N, num_warp_groups)]
             threads_per_warp = [32, 1]
             warps_per_cta = [4, num_warp_groups]
     return BlockedLayout(
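
The deleted _cdiv helper and triton.cdiv compute the same ceiling division, so the layout arithmetic above is unchanged by the swap; a quick standalone check (illustrative values):

import triton


def _cdiv(x, div):
    # The helper removed above: ceiling division.
    return (x + div - 1) // div


# triton.cdiv uses the same formula, so the computed register layouts match.
assert triton.cdiv(128, 3) == _cdiv(128, 3) == 43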
