
Commit 32b0d41
Merge branch 'main' into gqa1020
2 parents 14deea8 + cdc67fc

37 files changed: +287 −95 lines changed

CMakeLists.txt

Lines changed: 9 additions & 5 deletions

```diff
@@ -158,22 +158,26 @@ endif()
 find_package(Python REQUIRED COMPONENTS Interpreter Development.Module ${SKBUILD_SABI_COMPONENT})

 add_custom_command(
-  OUTPUT "${CMAKE_BINARY_DIR}/cython_wrapper.cpp"
+  OUTPUT "${CMAKE_BINARY_DIR}/tilelang_cython_wrapper.cpp"
   COMMENT
     "Cythoning tilelang/jit/adapter/cython/cython_wrapper.pyx"
   COMMAND Python::Interpreter -m cython
     "${CMAKE_CURRENT_SOURCE_DIR}/tilelang/jit/adapter/cython/cython_wrapper.pyx"
-    --cplus --output-file "${CMAKE_BINARY_DIR}/cython_wrapper.cpp"
+    --module-name tilelang_cython_wrapper
+    --cplus --output-file "${CMAKE_BINARY_DIR}/tilelang_cython_wrapper.cpp"
   DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/tilelang/jit/adapter/cython/cython_wrapper.pyx"
   VERBATIM)

 if(NOT "${SKBUILD_SABI_VERSION}" STREQUAL "")
   set(USE_SABI USE_SABI ${SKBUILD_SABI_VERSION})
 endif()

-python_add_library(cython_wrapper MODULE "${CMAKE_BINARY_DIR}/cython_wrapper.cpp" ${USE_SABI} WITH_SOABI)
-# Install to site dir to support direct import
-install(TARGETS cython_wrapper LIBRARY DESTINATION .)
+python_add_library(tilelang_cython_wrapper MODULE "${CMAKE_BINARY_DIR}/tilelang_cython_wrapper.cpp" ${USE_SABI} WITH_SOABI)
+# Install extension into the tilelang package directory
+install(TARGETS tilelang_cython_wrapper
+        LIBRARY DESTINATION tilelang
+        RUNTIME DESTINATION tilelang
+        ARCHIVE DESTINATION tilelang)

 # let libtilelang to search tvm/tvm_runtime in same dir
 if(APPLE)
```

docs/get_started/targets.md

Lines changed: 120 additions & 0 deletions (new file)

# Understanding Targets

TileLang is built on top of TVM, which relies on **targets** to describe the device you want to compile for.
The target determines which code generator is used (CUDA, HIP, Metal, LLVM, …) and allows you to pass
device-specific options such as GPU architecture flags. This page summarises how to pick and customise a target
when compiling TileLang programs.

## Common target strings

TileLang ships with a small set of common targets; each accepts the full range of TVM options so you can fine-tune
the generated code. The most frequent choices are listed below:

| Base name | Description |
| --------- | ----------- |
| `auto` | Detects CUDA → HIP → Metal in that order. Useful when running the same script across machines. |
| `cuda` | NVIDIA GPUs. Supports options such as `-arch=sm_80`, `-max_num_threads=1024`, etc. |
| `hip` | AMD GPUs via ROCm. Options like `-mcpu=gfx90a` can be appended. |
| `metal` | Apple Silicon GPUs (arm64 Macs). |
| `llvm` | CPU execution; accepts the standard TVM LLVM switches. |
| `webgpu` | Browser / WebGPU runtimes. |
| `c` | Emit plain C source for inspection or custom toolchains. |

To add options, append them after the base name, separated by spaces. For example:

```python
target = "cuda -arch=sm_90"
kernel = tilelang.compile(func, target=target, execution_backend="cython")

# or

@tilelang.jit(target=target)
def compiled_kernel(*args):
    return func(*args)
```

The same convention works for HIP or LLVM (e.g. `hip -mcpu=gfx940`, `llvm -mtriple=x86_64-linux-gnu`), as the sketch below shows.
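
A minimal sketch for those backends, assuming the same `func` as in the example above and using the option values quoted in this section:

```python
# Sketch: identical string convention, different base name and options.
hip_kernel = tilelang.compile(func, target="hip -mcpu=gfx90a")                 # AMD GPU via ROCm
cpu_kernel = tilelang.compile(func, target="llvm -mtriple=x86_64-linux-gnu")  # CPU
```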
35+
36+
### Advanced: Specify Exact Hardware
37+
38+
When you already know the precise GPU model, you can encode it in the target string—either via `-arch=sm_XX` or by
39+
using one of TVM’s pre-defined target tags such as `nvidia/nvidia-h100`. Supplying this detail is optional for
40+
TileLang in general use, but it becomes valuable when the TVM cost model is enabled (e.g. during autotuning). The
41+
cost model uses the extra attributes to make better scheduling predictions. If you skip this step (or do not use the
42+
cost model), generic targets like `cuda` or `auto` are perfectly fine.
43+
44+
All CUDA compute capabilities recognised by TVM’s target registry are listed below. Pick the one that matches your
45+
GPU and append it to the target string or use the corresponding target tag—for example `nvidia/nvidia-a100`.
46+
47+
| Architecture | GPUs (examples) |
48+
| ------------ | ---------------- |
49+
| `sm_20` | `nvidia/tesla-c2050`, `nvidia/tesla-c2070` |
50+
| `sm_21` | `nvidia/nvs-5400m`, `nvidia/geforce-gt-520` |
51+
| `sm_30` | `nvidia/quadro-k5000`, `nvidia/geforce-gtx-780m` |
52+
| `sm_35` | `nvidia/tesla-k40`, `nvidia/quadro-k6000` |
53+
| `sm_37` | `nvidia/tesla-k80` |
54+
| `sm_50` | `nvidia/quadro-k2200`, `nvidia/geforce-gtx-950m` |
55+
| `sm_52` | `nvidia/tesla-m40`, `nvidia/geforce-gtx-980` |
56+
| `sm_53` | `nvidia/jetson-tx1`, `nvidia/jetson-nano` |
57+
| `sm_60` | `nvidia/tesla-p100`, `nvidia/quadro-gp100` |
58+
| `sm_61` | `nvidia/tesla-p4`, `nvidia/quadro-p6000`, `nvidia/geforce-gtx-1080` |
59+
| `sm_62` | `nvidia/jetson-tx2` |
60+
| `sm_70` | `nvidia/nvidia-v100`, `nvidia/quadro-gv100` |
61+
| `sm_72` | `nvidia/jetson-agx-xavier` |
62+
| `sm_75` | `nvidia/nvidia-t4`, `nvidia/quadro-rtx-8000`, `nvidia/geforce-rtx-2080` |
63+
| `sm_80` | `nvidia/nvidia-a100`, `nvidia/nvidia-a30` |
64+
| `sm_86` | `nvidia/nvidia-a40`, `nvidia/nvidia-a10`, `nvidia/geforce-rtx-3090` |
65+
| `sm_87` | `nvidia/jetson-agx-orin-32gb`, `nvidia/jetson-agx-orin-64gb` |
66+
| `sm_89` | `nvidia/geforce-rtx-4090` |
67+
| `sm_90a` | `nvidia/nvidia-h100` (DPX profile) |
68+
| `sm_100a` | `nvidia/nvidia-b100` |
69+
70+
Refer to NVIDIA’s [CUDA GPUs](https://developer.nvidia.com/cuda-gpus) page or the TVM source
71+
(`3rdparty/tvm/src/target/tag.cc`) for the latest mapping between devices and compute capabilities.
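
As an illustration, a tag can be expanded into a full `Target` to inspect the attributes the cost model sees. This is a minimal sketch using TVM's public `Target` constructor; the exact attribute set depends on the TVM version bundled with TileLang:

```python
from tvm.target import Target

# Expand a pre-defined target tag into a fully-attributed target (sketch;
# attribute names and values depend on your TVM version).
target = Target("nvidia/nvidia-a100")
print(target.kind.name)      # "cuda"
print(target.attrs["arch"])  # "sm_80"
```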

## Creating targets programmatically

If you prefer working with TVM’s `Target` objects, TileLang exposes the helper
`tilelang.utils.target.determine_target` (returns a canonical target string by default, or the `Target`
object when `return_object=True`):

```python
from tilelang.utils.target import determine_target

tvm_target = determine_target("cuda -arch=sm_80", return_object=True)
kernel = tilelang.compile(func, target=tvm_target)
```

You can also build targets directly through TVM:

```python
from tvm.target import Target

target = Target("cuda", host="llvm")
target = target.with_host(Target("llvm -mcpu=skylake"))
```

TileLang accepts either `str` or `Target` inputs; internally they are normalised and cached using the canonical
string representation. **In user code we strongly recommend passing target strings rather than
`tvm.target.Target` instances—strings keep cache keys compact and deterministic across runs, whereas constructing
fresh `Target` objects may lead to slightly higher hashing overhead or inconsistent identity semantics.**
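
For example, both of the following resolve to the same normalised cache key, but the string form is the recommended spelling (a sketch, assuming `func` from the earlier examples):

```python
from tvm.target import Target

# Both specifications are normalised to the same canonical string internally;
# prefer the plain string in user code (sketch).
kernel_a = tilelang.compile(func, target="cuda -arch=sm_80")
kernel_b = tilelang.compile(func, target=Target("cuda -arch=sm_80"))
```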

## Discovering supported targets in code

Looking for a quick reminder of the built-in base names and their descriptions? Use:

```python
from tilelang.utils.target import describe_supported_targets

for name, doc in describe_supported_targets().items():
    print(f"{name:>6}: {doc}")
```

This helper mirrors the table above and is safe to call at runtime (for example when validating CLI arguments), as in the sketch below.
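
A minimal sketch of such CLI validation, assuming only the `describe_supported_targets` helper described above (the `--target` flag name is illustrative):

```python
import argparse

from tilelang.utils.target import describe_supported_targets

# Restrict the base name to the supported set; options such as
# "-arch=sm_80" would still be appended after parsing (sketch).
supported = describe_supported_targets()
parser = argparse.ArgumentParser()
parser.add_argument("--target", default="auto", choices=sorted(supported),
                    help="; ".join(f"{k}: {v}" for k, v in supported.items()))
args = parser.parse_args()
```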

## Troubleshooting tips

- If you see `Target cuda -arch=sm_80 is not supported`, double-check the spelling and that the option is valid for
  TVM. Any invalid switch will surface as a target-construction error.
- Runtime errors such as “no kernel image is available” usually mean the `-arch` flag does not match the GPU you are
  running on. Try dropping the flag or switching to the correct compute capability (see the sketch below).
- When targeting multiple environments, use `auto` for convenience and override with an explicit string only when
  you need architecture-specific tuning.
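
One way to avoid the mismatch is to derive the flag from the device actually present. A sketch using PyTorch's capability query (this is an illustration, not a TileLang API; it assumes a CUDA-capable device is visible):

```python
import torch

# Build an -arch flag matching the local GPU, e.g. (8, 0) -> "cuda -arch=sm_80".
major, minor = torch.cuda.get_device_capability()
target = f"cuda -arch=sm_{major}{minor}"
```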

docs/index.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -14,6 +14,7 @@ low-level optimizations necessary for state-of-the-art performance.

 get_started/Installation
 get_started/overview
+get_started/targets
 :::

```
examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -11,8 +11,6 @@

 from heuristic import num_splits_heuristic

-tilelang.disable_cache()
-

 def flashattn(batch, heads, heads_kv, dim, dim_v):
     scale = (1.0 / dim)**0.5 * 1.44269504  # log2(e)
```

examples/cast/example_per_token_cast_to_fp8.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -4,8 +4,6 @@
 from typing import Tuple
 from tilelang.utils.tensor import torch_assert_close

-tilelang.disable_cache()
-

 @tilelang.jit(out_idx=[1, 2])
 def per_token_cast_to_fp8(M, N, blk_m):
```

examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -5,8 +5,6 @@
 from einops import rearrange, einsum
 import argparse

-tilelang.disable_cache()
-

 def get_configs():
     import itertools
```

examples/elementwise/example_elementwise_add.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -5,8 +5,6 @@
 import tilelang.language as T
 from tilelang.autotuner import AutoTuner

-tilelang.disable_cache()
-

 def ref_program(x, y):
     return x + y
```

examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py

Lines changed: 0 additions & 1 deletion

```diff
@@ -7,7 +7,6 @@
 from einops import rearrange, repeat
 from bert_padding import pad_input, unpad_input

-# tilelang.disable_cache()
 torch.manual_seed(1)

```

examples/gdn/example_chunk_delta_bwd.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -24,8 +24,6 @@
 torch.random.manual_seed(0)
 # torch.set_printoptions(profile="full")

-tilelang.disable_cache()
-
 from utils import *

```

examples/gdn/example_chunk_delta_h.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -32,8 +32,6 @@

 torch.random.manual_seed(0)

-tilelang.disable_cache()
-

 def prepare_input(
     B,
```
