
Commit 32b0d41
Merge branch 'main' into gqa1020
2 parents 14deea8 + cdc67fc

37 files changed: +287 −95 lines changed

CMakeLists.txt

Lines changed: 9 additions & 5 deletions

```diff
@@ -158,22 +158,26 @@ endif()
 find_package(Python REQUIRED COMPONENTS Interpreter Development.Module ${SKBUILD_SABI_COMPONENT})

 add_custom_command(
-  OUTPUT "${CMAKE_BINARY_DIR}/cython_wrapper.cpp"
+  OUTPUT "${CMAKE_BINARY_DIR}/tilelang_cython_wrapper.cpp"
   COMMENT
     "Cythoning tilelang/jit/adapter/cython/cython_wrapper.pyx"
   COMMAND Python::Interpreter -m cython
     "${CMAKE_CURRENT_SOURCE_DIR}/tilelang/jit/adapter/cython/cython_wrapper.pyx"
-    --cplus --output-file "${CMAKE_BINARY_DIR}/cython_wrapper.cpp"
+    --module-name tilelang_cython_wrapper
+    --cplus --output-file "${CMAKE_BINARY_DIR}/tilelang_cython_wrapper.cpp"
   DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/tilelang/jit/adapter/cython/cython_wrapper.pyx"
   VERBATIM)

 if(NOT "${SKBUILD_SABI_VERSION}" STREQUAL "")
   set(USE_SABI USE_SABI ${SKBUILD_SABI_VERSION})
 endif()

-python_add_library(cython_wrapper MODULE "${CMAKE_BINARY_DIR}/cython_wrapper.cpp" ${USE_SABI} WITH_SOABI)
-# Install to site dir to support direct import
-install(TARGETS cython_wrapper LIBRARY DESTINATION .)
+python_add_library(tilelang_cython_wrapper MODULE "${CMAKE_BINARY_DIR}/tilelang_cython_wrapper.cpp" ${USE_SABI} WITH_SOABI)
+# Install extension into the tilelang package directory
+install(TARGETS tilelang_cython_wrapper
+        LIBRARY DESTINATION tilelang
+        RUNTIME DESTINATION tilelang
+        ARCHIVE DESTINATION tilelang)

 # let libtilelang to search tvm/tvm_runtime in same dir
 if(APPLE)
```

docs/get_started/targets.md

Lines changed: 120 additions & 0 deletions (new file)

# Understanding Targets

TileLang is built on top of TVM, which relies on **targets** to describe the device you want to compile for.
The target determines which code generator is used (CUDA, HIP, Metal, LLVM, …) and allows you to pass
device-specific options such as GPU architecture flags. This page summarises how to pick and customise a target
when compiling TileLang programs.

## Common target strings

TileLang ships with a small set of common targets; each accepts the full range of TVM options so you can fine-tune
the generated code. The most frequent choices are listed below:

| Base name | Description |
| --------- | ----------- |
| `auto` | Detects CUDA → HIP → Metal in that order. Useful when running the same script across machines. |
| `cuda` | NVIDIA GPUs. Supports options such as `-arch=sm_80`, `-max_num_threads=1024`, etc. |
| `hip` | AMD GPUs via ROCm. Options like `-mcpu=gfx90a` can be appended. |
| `metal` | Apple Silicon GPUs (arm64 Macs). |
| `llvm` | CPU execution; accepts the standard TVM LLVM switches. |
| `webgpu` | Browser / WebGPU runtimes. |
| `c` | Emit plain C source for inspection or custom toolchains. |

To add options, append them after the base name, separated by spaces. For example:

```python
target = "cuda -arch=sm_90"
kernel = tilelang.compile(func, target=target, execution_backend="cython")

# or

@tilelang.jit(target=target)
def compiled_kernel(*args):
    return func(*args)
```

The same convention works for HIP or LLVM (e.g. `hip -mcpu=gfx940`, `llvm -mtriple=x86_64-linux-gnu`), as the sketch below shows.
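
A minimal sketch for those backends, assuming the same `func` as in the example above and using the option values quoted in this section:

```python
# Sketch: identical string convention, different base name and options.
hip_kernel = tilelang.compile(func, target="hip -mcpu=gfx90a")                 # AMD GPU via ROCm
cpu_kernel = tilelang.compile(func, target="llvm -mtriple=x86_64-linux-gnu")  # CPU
```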
35+
36+
### Advanced: Specify Exact Hardware
37+
38+
When you already know the precise GPU model, you can encode it in the target string—either via `-arch=sm_XX` or by
39+
using one of TVM’s pre-defined target tags such as `nvidia/nvidia-h100`. Supplying this detail is optional for
40+
TileLang in general use, but it becomes valuable when the TVM cost model is enabled (e.g. during autotuning). The
41+
cost model uses the extra attributes to make better scheduling predictions. If you skip this step (or do not use the
42+
cost model), generic targets like `cuda` or `auto` are perfectly fine.
43+
44+
All CUDA compute capabilities recognised by TVM’s target registry are listed below. Pick the one that matches your
45+
GPU and append it to the target string or use the corresponding target tag—for example `nvidia/nvidia-a100`.
46+
47+
| Architecture | GPUs (examples) |
48+
| ------------ | ---------------- |
49+
| `sm_20` | `nvidia/tesla-c2050`, `nvidia/tesla-c2070` |
50+
| `sm_21` | `nvidia/nvs-5400m`, `nvidia/geforce-gt-520` |
51+
| `sm_30` | `nvidia/quadro-k5000`, `nvidia/geforce-gtx-780m` |
52+
| `sm_35` | `nvidia/tesla-k40`, `nvidia/quadro-k6000` |
53+
| `sm_37` | `nvidia/tesla-k80` |
54+
| `sm_50` | `nvidia/quadro-k2200`, `nvidia/geforce-gtx-950m` |
55+
| `sm_52` | `nvidia/tesla-m40`, `nvidia/geforce-gtx-980` |
56+
| `sm_53` | `nvidia/jetson-tx1`, `nvidia/jetson-nano` |
57+
| `sm_60` | `nvidia/tesla-p100`, `nvidia/quadro-gp100` |
58+
| `sm_61` | `nvidia/tesla-p4`, `nvidia/quadro-p6000`, `nvidia/geforce-gtx-1080` |
59+
| `sm_62` | `nvidia/jetson-tx2` |
60+
| `sm_70` | `nvidia/nvidia-v100`, `nvidia/quadro-gv100` |
61+
| `sm_72` | `nvidia/jetson-agx-xavier` |
62+
| `sm_75` | `nvidia/nvidia-t4`, `nvidia/quadro-rtx-8000`, `nvidia/geforce-rtx-2080` |
63+
| `sm_80` | `nvidia/nvidia-a100`, `nvidia/nvidia-a30` |
64+
| `sm_86` | `nvidia/nvidia-a40`, `nvidia/nvidia-a10`, `nvidia/geforce-rtx-3090` |
65+
| `sm_87` | `nvidia/jetson-agx-orin-32gb`, `nvidia/jetson-agx-orin-64gb` |
66+
| `sm_89` | `nvidia/geforce-rtx-4090` |
67+
| `sm_90a` | `nvidia/nvidia-h100` (DPX profile) |
68+
| `sm_100a` | `nvidia/nvidia-b100` |
69+
70+
Refer to NVIDIA’s [CUDA GPUs](https://developer.nvidia.com/cuda-gpus) page or the TVM source
71+
(`3rdparty/tvm/src/target/tag.cc`) for the latest mapping between devices and compute capabilities.
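
As an illustration, a tag can be expanded into a full `Target` to inspect the attributes the cost model sees. This is a minimal sketch using TVM's public `Target` constructor; the exact attribute set depends on the TVM version bundled with TileLang:

```python
from tvm.target import Target

# Expand a pre-defined target tag into a fully-attributed target (sketch;
# attribute names and values depend on your TVM version).
target = Target("nvidia/nvidia-a100")
print(target.kind.name)      # "cuda"
print(target.attrs["arch"])  # "sm_80"
```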

## Creating targets programmatically

If you prefer working with TVM’s `Target` objects, TileLang exposes the helper
`tilelang.utils.target.determine_target` (returns a canonical target string by default, or the `Target`
object when `return_object=True`):

```python
from tilelang.utils.target import determine_target

tvm_target = determine_target("cuda -arch=sm_80", return_object=True)
kernel = tilelang.compile(func, target=tvm_target)
```

You can also build targets directly through TVM:

```python
from tvm.target import Target

target = Target("cuda", host="llvm")
target = target.with_host(Target("llvm -mcpu=skylake"))
```

TileLang accepts either `str` or `Target` inputs; internally they are normalised and cached using the canonical
string representation. **In user code we strongly recommend passing target strings rather than
`tvm.target.Target` instances—strings keep cache keys compact and deterministic across runs, whereas constructing
fresh `Target` objects may lead to slightly higher hashing overhead or inconsistent identity semantics.**
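
For example, both of the following resolve to the same normalised cache key, but the string form is the recommended spelling (a sketch, assuming `func` from the earlier examples):

```python
from tvm.target import Target

# Both specifications are normalised to the same canonical string internally;
# prefer the plain string in user code (sketch).
kernel_a = tilelang.compile(func, target="cuda -arch=sm_80")
kernel_b = tilelang.compile(func, target=Target("cuda -arch=sm_80"))
```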

## Discovering supported targets in code

Looking for a quick reminder of the built-in base names and their descriptions? Use:

```python
from tilelang.utils.target import describe_supported_targets

for name, doc in describe_supported_targets().items():
    print(f"{name:>6}: {doc}")
```

This helper mirrors the table above and is safe to call at runtime (for example when validating CLI arguments), as in the sketch below.
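
A minimal sketch of such CLI validation, assuming only the `describe_supported_targets` helper described above (the `--target` flag name is illustrative):

```python
import argparse

from tilelang.utils.target import describe_supported_targets

# Restrict the base name to the supported set; options such as
# "-arch=sm_80" would still be appended after parsing (sketch).
supported = describe_supported_targets()
parser = argparse.ArgumentParser()
parser.add_argument("--target", default="auto", choices=sorted(supported),
                    help="; ".join(f"{k}: {v}" for k, v in supported.items()))
args = parser.parse_args()
```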

## Troubleshooting tips

- If you see `Target cuda -arch=sm_80 is not supported`, double-check the spelling and that the option is valid for
  TVM. Any invalid switch will surface as a target-construction error.
- Runtime errors such as “no kernel image is available” usually mean the `-arch` flag does not match the GPU you are
  running on. Try dropping the flag or switching to the correct compute capability (see the sketch below).
- When targeting multiple environments, use `auto` for convenience and override with an explicit string only when
  you need architecture-specific tuning.
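
One way to avoid the mismatch is to derive the flag from the device actually present. A sketch using PyTorch's capability query (this is an illustration, not a TileLang API; it assumes a CUDA-capable device is visible):

```python
import torch

# Build an -arch flag matching the local GPU, e.g. (8, 0) -> "cuda -arch=sm_80".
major, minor = torch.cuda.get_device_capability()
target = f"cuda -arch=sm_{major}{minor}"
```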

docs/index.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -14,6 +14,7 @@ low-level optimizations necessary for state-of-the-art performance.

 get_started/Installation
 get_started/overview
+get_started/targets
 :::

```
examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -11,8 +11,6 @@

 from heuristic import num_splits_heuristic

-tilelang.disable_cache()
-

 def flashattn(batch, heads, heads_kv, dim, dim_v):
     scale = (1.0 / dim)**0.5 * 1.44269504  # log2(e)
```

examples/cast/example_per_token_cast_to_fp8.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -4,8 +4,6 @@
 from typing import Tuple
 from tilelang.utils.tensor import torch_assert_close

-tilelang.disable_cache()
-

 @tilelang.jit(out_idx=[1, 2])
 def per_token_cast_to_fp8(M, N, blk_m):
```

examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -5,8 +5,6 @@
 from einops import rearrange, einsum
 import argparse

-tilelang.disable_cache()
-

 def get_configs():
     import itertools
```

examples/elementwise/example_elementwise_add.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -5,8 +5,6 @@
 import tilelang.language as T
 from tilelang.autotuner import AutoTuner

-tilelang.disable_cache()
-

 def ref_program(x, y):
     return x + y
```

examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py

Lines changed: 0 additions & 1 deletion

```diff
@@ -7,7 +7,6 @@
 from einops import rearrange, repeat
 from bert_padding import pad_input, unpad_input

-# tilelang.disable_cache()
 torch.manual_seed(1)

```

examples/gdn/example_chunk_delta_bwd.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -24,8 +24,6 @@
 torch.random.manual_seed(0)
 # torch.set_printoptions(profile="full")

-tilelang.disable_cache()
-
 from utils import *

```

examples/gdn/example_chunk_delta_h.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -32,8 +32,6 @@

 torch.random.manual_seed(0)

-tilelang.disable_cache()
-

 def prepare_input(
     B,
```
