[Kernel] Adding basic Triton JitCache for triton_attn #16606

Open
wants to merge 49 commits into main
Commits
49 commits
b3a01dc
copying jit_cache, adapting jit cache for 2d kernel
bringlein Apr 10, 2025
3490bfc
some cleanup
bringlein Apr 10, 2025
4cc5407
formatting, typos...
bringlein Apr 10, 2025
59755e2
ruff....
bringlein Apr 10, 2025
f114090
adding assume const to jit cache
bringlein Apr 11, 2025
c43006e
experimenting with static launch grid again
bringlein Apr 11, 2025
9da4df6
recovering good performance
bringlein Apr 11, 2025
d7fc0af
going back to static launch grid
bringlein Apr 14, 2025
bf64b6d
formatting...
bringlein Apr 14, 2025
f3fb7e9
make type checking of key arguments more helpful
bringlein Apr 14, 2025
dc3b28c
applying jit cache for prefix prefill
bringlein Apr 14, 2025
e717040
fmt & ruff
bringlein Apr 14, 2025
fe2f6a5
ci
bringlein Apr 14, 2025
14cca7e
remove changed requirements by mistake/pre-hook?
bringlein Apr 14, 2025
d37ef48
fmt...
bringlein Apr 14, 2025
5e4bb2f
removing jit cache from prefix prefill again
bringlein Apr 15, 2025
c711433
cleanup
bringlein Apr 15, 2025
f8c6610
address review comments
bringlein Apr 24, 2025
f8b5001
fix type hints
bringlein Apr 24, 2025
ef3d6a3
add transparency as fallback mode
bringlein Apr 24, 2025
edf8633
CI whacamole
bringlein Apr 24, 2025
10df1df
CI whacamole...
bringlein Apr 24, 2025
cf1cea9
Merge branch 'main' into ngl_jit_cache_pr
bringlein May 7, 2025
f6852ed
adding triton 3.3 support
bringlein May 8, 2025
b93de23
Merge branch 'main' into ngl_jit_cache_pr
bringlein May 8, 2025
72d9858
fixing triton 3.3 support (1/x); add support for unified kernel
bringlein May 8, 2025
eeaab8d
fixing triton 3.3 support (2/2)
bringlein May 9, 2025
9ffc6e4
cleanup and add env var
bringlein May 9, 2025
1c65d75
adding assume_const
bringlein May 9, 2025
43b500b
make argument passing (slightly) faster
bringlein May 9, 2025
43aed8c
Merge branch 'main' into ngl_jit_cache_pr (moving envs content)
bringlein May 13, 2025
e50534a
fixing env var merge conflict
bringlein May 13, 2025
450770c
adding attention metadata specific for triton_backend
bringlein May 13, 2025
f7705c0
fixing env file again
bringlein May 13, 2025
3a5c63e
Revert "adding attention metadata specific for triton_backend"
bringlein May 13, 2025
e2ef23e
more elegant fix on dependency of flash attention
bringlein May 13, 2025
8f5735b
thrid way to un-break triton backend
bringlein May 13, 2025
ccd22c9
CI...
bringlein May 13, 2025
a94e99b
making jitcache safe to use with autotuner
cyang49 May 14, 2025
af094a3
CI whacamole...
bringlein May 14, 2025
c1b21d5
fixup spelling in a few spots
tlrmchlsmth May 20, 2025
be9d7d4
Merge branch 'main' into ngl_jit_cache_pr
tdoublep May 23, 2025
791b8b2
Added support for specialization.
tdoublep May 23, 2025
f4a436a
Merge branch 'main' into ngl_jit_cache_pr
bringlein Jun 12, 2025
f72a768
minor cleanup; remove copy of launch grid
bringlein Jun 13, 2025
02a6ea4
improve docstring
bringlein Jun 18, 2025
cd987c2
Merge branch 'main' into ngl_jit_cache_pr
bringlein Jun 18, 2025
d52af9b
ruff....
bringlein Jun 18, 2025
e1cf444
fixing merge error
bringlein Jun 18, 2025
19 changes: 19 additions & 0 deletions vllm/attention/ops/triton_unified_attention.py
@@ -12,6 +12,7 @@
import triton.language as tl

from vllm.logger import init_logger
from vllm.triton_utils.jit_cache import jitcache

logger = init_logger(__name__)

@@ -47,6 +48,24 @@ def find_seq_idx(query_start_len_ptr, target_idx, num_seqs,
return left - 1


@jitcache(
check_keys=[],
check_specialization=["num_seqs"],
assume_const=[
"scale",
"k_scale",
"v_scale",
"query_stride_1",
"output_stride_1",
"stride_k_cache_0",
"stride_k_cache_1",
"stride_k_cache_2",
"stride_k_cache_4",
"stride_v_cache_0",
"stride_v_cache_1",
"stride_v_cache_2",
],
)
@triton.jit
def kernel_unified_attention_2d(
output_ptr, # [num_tokens, num_query_heads, head_size]
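For readers unfamiliar with the idea behind this decorator stack: a JIT cache keys the compiled kernel only on the arguments declared in `check_keys`/`check_specialization`, so repeated launches can skip the dispatcher's full per-launch key computation. The sketch below is a hypothetical, Triton-free simplification to illustrate the caching pattern only; it is not the PR's actual `vllm.triton_utils.jit_cache` implementation, and the names `jitcache_sketch` and `kernel` are ours.

```python
def jitcache_sketch(check_keys):
    """Cache one 'compiled' kernel per tuple of declared key-argument values."""
    def decorator(fn):
        cache = {}

        def launcher(**kwargs):
            # Only the declared keys participate in cache lookup; all other
            # arguments are assumed not to require recompilation.
            key = tuple(kwargs[k] for k in check_keys)
            if key not in cache:
                cache[key] = fn  # stand-in for an expensive Triton compile
            return cache[key](**kwargs)

        launcher.cache = cache  # exposed for inspection in this sketch
        return launcher
    return decorator


@jitcache_sketch(check_keys=["BLOCK"])
def kernel(x, BLOCK):
    return x * BLOCK


kernel(x=2, BLOCK=16)   # first launch for BLOCK=16: populates the cache
kernel(x=3, BLOCK=16)   # same key: reuses the cached entry
kernel(x=2, BLOCK=32)   # new key: a second variant is cached
```

In the real decorator, `assume_const` extends this further: those arguments are baked in at first launch and never re-passed, which is why the PR lists the strides and scales there.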
6 changes: 6 additions & 0 deletions vllm/envs.py
@@ -76,6 +76,7 @@
VLLM_PLUGINS: Optional[list[str]] = None
VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None
VLLM_TORCH_PROFILER_DIR: Optional[str] = None
VLLM_TRITON_ENABLE_JITCACHE: bool = False
VLLM_USE_TRITON_AWQ: bool = False
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
VLLM_SKIP_P2P_CHECK: bool = False
@@ -589,6 +590,11 @@ def get_vllm_port() -> Optional[int]:
lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os
.path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", "."))),

# Enable the JITCache for Triton Kernels
# see triton_utils/jitcache.py
"VLLM_TRITON_ENABLE_JITCACHE":
lambda: bool(int(os.getenv("VLLM_TRITON_ENABLE_JITCACHE", "0"))),

# If set, vLLM will use Triton implementations of AWQ.
"VLLM_USE_TRITON_AWQ":
lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))),
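The new flag is parsed with the same `bool(int(os.getenv(...)))` idiom as the surrounding entries, so only the strings `"1"` (or another nonzero integer) enable it. A minimal standalone illustration of that pattern; the helper name `read_flag` is ours, not vLLM's:

```python
import os

def read_flag(name: str, default: str = "0") -> bool:
    # Mirrors the envs.py pattern: the env string is parsed as an int,
    # then truth-tested, so unset or "0" means disabled.
    return bool(int(os.getenv(name, default)))

os.environ["VLLM_TRITON_ENABLE_JITCACHE"] = "1"
enabled = read_flag("VLLM_TRITON_ENABLE_JITCACHE")
```

Note that non-integer values such as `"true"` would raise `ValueError` under this idiom, so the flag must be set as `VLLM_TRITON_ENABLE_JITCACHE=1`.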