Commit 1fc01a5

[FFI] Rebase tvm to v0.22.0 to utilize tvm-ffi (tile-ai#1108)
* 3rdparty tvm bump
* bump tvm into v0.22.0
* lint fix
* rebase tvm
* Update submodule tvm to latest commit 3085bc4
* Refactor: Update configuration retrieval in CopyNode and adjust test registration in tilelang
* test fix
* add requirement
* atomic_fix
* atomic_fix
* phaseout py39
* optimize
* optimize
* lint fix
* do not clean cache
* do not clean cache
* [Minor] Minor update for Python versions and dependencies
* [Lint] fix lint for py39
* [Lint] fix lint for ROCm
* [Build][CI] Sync CI changes from upstream/sdist
* [Lint] fix lint for ROCm
* [Build][CI] Update `repair-wheel-command`
* [Minor] update abi3audit result format
* [Lint] fix lint for ROCm
* [BugFix] fix build
* [Lint] fix lint for ROCm
* [BugFix] set rpath for libtvm and libtvm_runtime
* [Deps] pin apache-tvm-ffi version
* [Build] set Python 3.9 Limited API for Cython target
* [Build] set Python 3.9 Limited API for Cython target
* [Deps] Restore Python 3.8 support
* [Build] use `apache-tvm-ffi`'s `libtvm_ffi`
* [BugFix] use `;` as delimiter for RPATH on macOS
* [BugFix] use `--ignore-missing-dependencies` for `delocate-wheel`
* [Build] support `sccache` if available
* [Build] add CIBW import test
* [Build][CI] enable ccache for CIBW on Linux
* [BugFix] set rpath for libtvm and libtvm_runtime
* Revert "[Build][CI] enable ccache for CIBW on Linux" (this reverts commit cd9ab57)
* [CI] fix perfbench bot
* [BugFix] use Python 3.9 to build wheel
* [Minor] update perfbench bot envs
* [BugFix] fix CIBW environment on Linux
* [CI] skip import test on CentOS 7
* [CI] use Python urllib to download file instead of Wget

---------

Co-authored-by: Xuehai Pan <XuehaiPan@pku.edu.cn>
1 parent a9e48cd commit 1fc01a5

File tree

126 files changed: +774 −1793 lines changed


.clang-tidy

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 ---
 InheritParentConfig: true
-ExtraArgs: ['-v']
+ExtraArgs: []
 FormatStyle: file
 UseColor: true
 WarningsAsErrors: '*'

.github/workflows/ci.yml

Lines changed: 4 additions & 2 deletions
@@ -22,10 +22,12 @@ env:
   PYTHONDEVMODE: "1"
   PYTHONUNBUFFERED: "1"
   PYTHONPATH: "" # explicit cleanup
+  PIP_USER: "" # explicit cleanup
   COLUMNS: "100"
   FORCE_COLOR: "1"
   CLICOLOR_FORCE: "1"
   UV_INDEX_STRATEGY: "unsafe-best-match"
+  UV_HTTP_TIMEOUT: "600"
   XDG_CACHE_HOME: "${{ github.workspace }}/.cache" # to be updated
   PIP_CACHE_DIR: "${{ github.workspace }}/.cache/pip" # to be updated
   UV_CACHE_DIR: "${{ github.workspace }}/.cache/uv" # to be updated
@@ -44,15 +46,15 @@ jobs:
           submodules: recursive
 
       - name: Setup Python 3.8
-        id: setup-py38
+        id: setup-pylowest
         uses: actions/setup-python@v6
         with:
           python-version: "3.8" # use lowest supported version for linting
           update-environment: false
 
       - name: Check AST with Python 3.8
         run: |
-          "${{ steps.setup-py38.outputs.python-path }}" -m compileall -q -f tilelang
+          "${{ steps.setup-pylowest.outputs.python-path }}" -m compileall -q -f tilelang
 
       - name: Setup Python 3.12
         uses: actions/setup-python@v6

.github/workflows/dist.yml

Lines changed: 5 additions & 8 deletions
@@ -108,14 +108,11 @@ jobs:
           - { runner: ubuntu-24.04-arm, toolkit: "CUDA-12.8" }
           - { runner: macos-latest, toolkit: "Metal" }
         python-version:
-          - "3.8"
-          # TVM is built with Python 3.8 Limited API, it should work with all Python >= 3.8.
-          # - "3.9"
-          # - "3.10"
-          # - "3.11"
-          # - "3.12"
-          # - "3.13"
-          # - "3.14"
+          # Wheels are built with Python 3.8 Limited API, they should work with all Python >= 3.8.
+          # Only build wheels against Python 3.8 Limited API to save CI resources.
+          # FIXME: Here we use Python 3.9 because our dependency `apache-tvm-ffi` claims to support
+          # Python 3.8 but it depends on a version of `ml-dtypes` that requires Python >= 3.9.
+          - "3.9"
         fail-fast: false
       timeout-minutes: 120
       runs-on: ${{ matrix.target.runner }}
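Since the commit log mentions `abi3audit`, here is a minimal sketch of how the Limited API claim of a wheel built by this matrix could be checked after the fact; the wheel filename below is hypothetical:

```bash
# abi3audit scans an abi3-tagged wheel and reports any extension symbols
# outside the declared Limited API level.
python3 -m pip install abi3audit
# Hypothetical wheel name; built against CPython 3.9 per the matrix above.
abi3audit tilelang-0.1.0-cp39-abi3-manylinux_2_28_x86_64.whl
```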

.github/workflows/pr-perfbench-bot.yml

Lines changed: 17 additions & 1 deletion
@@ -12,6 +12,17 @@ concurrency:
   group: "${{ github.workflow }}-${{ github.ref }}"
   cancel-in-progress: true # always cancel in-progress
 
+env:
+  PYTHONDEVMODE: "1"
+  PYTHONUNBUFFERED: "1"
+  PYTHONPATH: "" # explicit cleanup
+  PIP_USER: "" # explicit cleanup
+  COLUMNS: "100"
+  FORCE_COLOR: "1"
+  CLICOLOR_FORCE: "1"
+  XDG_CACHE_HOME: "${{ github.workspace }}/.cache" # to be updated
+  PIP_CACHE_DIR: "${{ github.workspace }}/.cache/pip" # to be updated
+
 jobs:
   perfbench:
     name: Benchmark between PR and main
@@ -31,7 +42,12 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v6
         with:
-          python-version: "3.9"
+          python-version: "3.12"
+          update-environment: true
+          cache: pip
+          cache-dependency-path: |
+            pyproject.toml
+            requirements*.txt
 
       - name: Install merged version
         run: |

3rdparty/tvm

Submodule tvm updated from 5bf17a3 to 0f1ebab

CMakeLists.txt

Lines changed: 27 additions & 15 deletions
@@ -8,6 +8,11 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND "$ENV{CIBUILDWHEEL}")
+  # Warning came from tvm submodule
+  string(APPEND CMAKE_CXX_FLAGS " -Wno-dangling-reference")
+endif()
+
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 
 if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.gitmodules" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
@@ -36,9 +41,18 @@ endif()
 
 find_program(CCACHE_PROGRAM ccache)
 if(CCACHE_PROGRAM)
+  message(STATUS "Using ccache: ${CCACHE_PROGRAM}")
   set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}" CACHE STRING "C compiler launcher")
   set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}" CACHE STRING "CXX compiler launcher")
   set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE_PROGRAM}" CACHE STRING "CUDA compiler launcher")
+else()
+  find_program(SCCACHE_PROGRAM sccache)
+  if(SCCACHE_PROGRAM)
+    message(STATUS "Using sccache: ${SCCACHE_PROGRAM}")
+    set(CMAKE_C_COMPILER_LAUNCHER "${SCCACHE_PROGRAM}" CACHE STRING "C compiler launcher")
+    set(CMAKE_CXX_COMPILER_LAUNCHER "${SCCACHE_PROGRAM}" CACHE STRING "CXX compiler launcher")
+    set(CMAKE_CUDA_COMPILER_LAUNCHER "${SCCACHE_PROGRAM}" CACHE STRING "CUDA compiler launcher")
+  endif()
 endif()
 
 # Configs
@@ -68,8 +82,6 @@ file(GLOB TILE_LANG_SRCS
   src/target/utils.cc
   src/target/codegen_cpp.cc
   src/target/rt_mod_cpp.cc
-  # webgpu doesn't have system dependency
-  src/target/codegen_webgpu.cc
   # intrin_rule doesn't have system dependency
   src/target/intrin_rule*.cc
 )
@@ -181,18 +193,18 @@ install(TARGETS tilelang_cython_wrapper
 
 # let libtilelang to search tvm/tvm_runtime in same dir
 if(APPLE)
-  set_target_properties(tilelang PROPERTIES INSTALL_RPATH "@loader_path")
-  set_target_properties(tilelang_module PROPERTIES INSTALL_RPATH "@loader_path")
-else()
-  set_target_properties(tilelang PROPERTIES INSTALL_RPATH "\$ORIGIN")
-  set_target_properties(tilelang_module PROPERTIES INSTALL_RPATH "\$ORIGIN")
+  set_target_properties(tilelang PROPERTIES INSTALL_RPATH "@loader_path;@loader_path/../../tvm_ffi/lib")
+  set_target_properties(tilelang_module PROPERTIES INSTALL_RPATH "@loader_path;@loader_path/../../tvm_ffi/lib")
+  set_target_properties(tvm PROPERTIES INSTALL_RPATH "@loader_path;@loader_path/../../tvm_ffi/lib")
+  set_target_properties(tvm_runtime PROPERTIES INSTALL_RPATH "@loader_path;@loader_path/../../tvm_ffi/lib")
+elseif(UNIX)
+  set_target_properties(tilelang PROPERTIES INSTALL_RPATH "\$ORIGIN:\$ORIGIN/../../tvm_ffi/lib")
+  set_target_properties(tilelang_module PROPERTIES INSTALL_RPATH "\$ORIGIN:\$ORIGIN/../../tvm_ffi/lib")
+  set_target_properties(tvm PROPERTIES INSTALL_RPATH "\$ORIGIN:\$ORIGIN/../../tvm_ffi/lib")
+  set_target_properties(tvm_runtime PROPERTIES INSTALL_RPATH "\$ORIGIN:\$ORIGIN/../../tvm_ffi/lib")
 endif()
 
-install(TARGETS tvm tvm_runtime tilelang_module tilelang LIBRARY DESTINATION tilelang/lib)
-
-# Copy tvm cython ext for wheels
-# TODO: not necessary for editable builds
-if(TVM_BUILD_FROM_SOURCE)
-  add_dependencies(tilelang tvm_cython)
-  install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/tvm/python/tvm/ffi/core.abi3.so" DESTINATION tilelang/3rdparty/tvm/python/tvm/ffi/)
-endif()
+install(
+  TARGETS tvm tvm_runtime tilelang_module tilelang
+  LIBRARY DESTINATION tilelang/lib
+)
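A quick sketch of how the resulting install RPATHs could be verified on an installed package; the library filenames are assumptions derived from the install targets above:

```bash
# Linux: expect $ORIGIN plus the relative tvm_ffi/lib entry.
readelf -d tilelang/lib/libtilelang.so | grep -E 'RPATH|RUNPATH'
# macOS: expect two LC_RPATH commands; note that CMake's INSTALL_RPATH
# needs ';' (not ':') to separate multiple macOS rpath entries, which is
# what the "[BugFix] use `;` as delimiter for RPATH on macOS" commit fixed.
otool -l tilelang/lib/libtilelang.dylib | grep -A2 LC_RPATH
```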

cmake/load_tvm.cmake

Lines changed: 10 additions & 1 deletion
@@ -11,8 +11,17 @@ endif()
 
 set(TVM_INCLUDES
   ${TVM_SOURCE}/include
-  ${TVM_SOURCE}/ffi/include
   ${TVM_SOURCE}/src
   ${TVM_SOURCE}/3rdparty/dlpack/include
   ${TVM_SOURCE}/3rdparty/dmlc-core/include
 )
+
+if(EXISTS ${TVM_SOURCE}/ffi/include)
+  list(APPEND TVM_INCLUDES ${TVM_SOURCE}/ffi/include)
+elseif(EXISTS ${TVM_SOURCE}/3rdparty/tvm-ffi/include)
+  list(APPEND TVM_INCLUDES ${TVM_SOURCE}/3rdparty/tvm-ffi/include)
+endif()
+
+if(EXISTS ${TVM_SOURCE}/3rdparty/tvm-ffi/3rdparty/dlpack/include)
+  list(APPEND TVM_INCLUDES ${TVM_SOURCE}/3rdparty/tvm-ffi/3rdparty/dlpack/include)
+endif()
examples/gemm/README.md

Lines changed: 37 additions & 34 deletions
@@ -4,31 +4,34 @@ TileLang is a domain-specific language designed to simplify the process of writi
 
 ## Table of Contents
 
-1. [Getting Started](#getting-started)
-2. [Simple GEMM Example](#simple-gemm-example)
-   - [Code Walkthrough](#code-walkthrough)
-   - [Compiling and Profiling](#compiling-and-profiling)
-3. [Advanced GEMM Features](#advanced-gemm-features)
-   - [Custom Memory Layout / Swizzling](#custom-memory-layout--swizzling)
-   - [Parallel Copy and Auto-Pipelining](#parallel-copy-and-auto-pipelining)
-   - [Rasterization for L2 Cache Locality](#rasterization-for-l2-cache-locality)
-4. [Enhanced GEMM Example with Annotations](#enhanced-gemm-example-with-annotations)
-5. [Verifying Correctness](#verifying-correctness)
-6. [Fine-grained MMA Computations](#fine-grained-mma-computations)
-   - [Example Workflow](#example-workflow)
-   - [Summary](#summary)
-7. [References](#references)
+- [Table of Contents](#table-of-contents)
+- [Getting Started](#getting-started)
+  - [Prerequisites](#prerequisites)
+  - [Installation](#installation)
+- [Simple GEMM Example](#simple-gemm-example)
+  - [Code Walkthrough](#code-walkthrough)
+  - [Compiling and Profiling](#compiling-and-profiling)
+- [Advanced GEMM Features](#advanced-gemm-features)
+  - [Custom Memory Layout / Swizzling](#custom-memory-layout--swizzling)
+  - [Parallel Copy and Auto-Pipelining](#parallel-copy-and-auto-pipelining)
+  - [Rasterization for L2 Cache Locality](#rasterization-for-l2-cache-locality)
+- [Enhanced GEMM Example with Annotations](#enhanced-gemm-example-with-annotations)
+- [Verifying Correctness](#verifying-correctness)
+- [Fine-grained MMA Computations](#fine-grained-mma-computations)
+  - [Example Workflow](#example-workflow)
+  - [Summary](#summary)
+- [References](#references)
 
 ---
 
 ## Getting Started
 
 ### Prerequisites
 
-- **Python 3.8+**
-- **NVIDIA GPU** with a recent CUDA toolkit installed
+- **Python 3.8+**
+- **NVIDIA GPU** with a recent CUDA toolkit installed
 - **PyTorch** (optional, for easy correctness verification)
-- **tilelang**
+- **tilelang**
 - **bitblas** (optional; used for swizzle layout utilities in the advanced examples)
 
 ### Installation
@@ -87,34 +90,34 @@ def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="flo
 
 ### Code Walkthrough
 
-1. **Define the Kernel Launch Configuration:**
+1. **Define the Kernel Launch Configuration:**
    ```python
   with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
    ```
    This creates a grid of blocks (ceildiv(N, block_N) in x-dimension, ceildiv(M, block_M) in y-dimension), each with 128 threads.
 
-2. **Shared Memory Allocation:**
+2. **Shared Memory Allocation:**
    ```python
   A_shared = T.alloc_shared((block_M, block_K), dtype)
   B_shared = T.alloc_shared((block_K, block_N), dtype)
    ```
    Tiles of \(A\) and \(B\) are loaded into these shared memory buffers for faster access.
 
-3. **Local Fragment Accumulation:**
+3. **Local Fragment Accumulation:**
    ```python
   C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
    ```
    Partial results are stored in registers (or local memory) to reduce writes to global memory.
 
-4. **Pipelined Loading and GEMM:**
+4. **Pipelined Loading and GEMM:**
    ```python
   for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):
       T.copy(...)
       T.gemm(...)
    ```
    Loads blocks of \(A\) and \(B\) in a pipelined fashion (up to 3 stages). This exploits overlap of data transfer and computation.
 
-5. **Copy Out the Results:**
+5. **Copy Out the Results:**
    ```python
   T.copy(C_local, C[by * block_M, bx * block_N])
    ```
@@ -216,10 +219,10 @@ def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="flo
     return main
 ```
 
-**Key Differences vs. Basic Example**
-1. **`T.annotate_layout(...)`**: Annotates how data should be organized in shared memory (swizzling).
-2. **`T.use_swizzle(...)`**: Enables swizzle-based rasterization.
-3. **Parallel Copy Loop** with `T.Parallel(...)`: Distributes global-to-shared copy across all threads, potentially vectorizing load/store instructions.
+**Key Differences vs. Basic Example**
+1. **`T.annotate_layout(...)`**: Annotates how data should be organized in shared memory (swizzling).
+2. **`T.use_swizzle(...)`**: Enables swizzle-based rasterization.
+3. **Parallel Copy Loop** with `T.Parallel(...)`: Distributes global-to-shared copy across all threads, potentially vectorizing load/store instructions.
 
 ---
 
@@ -247,7 +250,7 @@ print("Results match!")
 
 ## Fine-grained MMA Computations
 
-For advanced users who require full control over warp-level matrix multiplication operations, TileLang allows you to specify fine-grained MMA (Matrix Multiply-Accumulate) computations in a manner similar to writing raw CUDA. While higher-level abstractions like `T.gemm(...)` or automatic MMA emitters are sufficient for many use cases, specialized workloads (for example, dequantize gemm may require fine-grained layout transformation on shared to register stage) may benefit from explicitly controlling each MMA instruction, the data layout, and the synchronization points.
+For advanced users who require full control over warp-level matrix multiplication operations, TileLang allows you to specify fine-grained MMA (Matrix Multiply-Accumulate) computations in a manner similar to writing raw CUDA. While higher-level abstractions like `T.gemm(...)` or automatic MMA emitters are sufficient for many use cases, specialized workloads (for example, dequantize gemm may require fine-grained layout transformation on shared to register stage) may benefit from explicitly controlling each MMA instruction, the data layout, and the synchronization points.
 
 ### Example Workflow
 
@@ -394,10 +397,10 @@ def tl_matmul(
     ]
 ```
 
-1. **Set Up Tile Sizes and Thread Bindings**
+1. **Set Up Tile Sizes and Thread Bindings**
    Just like in CUDA, you will typically start by defining how many warps or threads per block you want and how your matrix is subdivided. In TileLang, this is done via `T.Kernel(...)` and `T.thread_binding(...),` which ensure that the correct number of threads are active, and each thread is bound to a specific role (e.g., warp ID or lane ID).
 
-2. **Allocate Warp-local Fragments**
+2. **Allocate Warp-local Fragments**
    Instead of using a single shared buffer for partial sums, you allocate local buffers (register fragments) to hold sub-blocks of matrices \(A\) and \(B\). In TileLang, this is done with something like:
    ```python
   A_local = T.alloc_local((warp_rows * local_size_a), in_dtype)
@@ -406,7 +409,7 @@ def tl_matmul(
    ```
    Each of these `local` allocations represents a region of per-thread storage, which collectively forms the warp’s register tiles.
 
-3. **Load Data via `ldmatrix`**
+3. **Load Data via `ldmatrix`**
    Fine-grained loading instructions allow you to specify exactly how data moves from shared memory to the warp-level fragments. In the example below, `mma_emitter.ldmatrix_a()` and `.ldmatrix_b()` are higher-level wrappers around warp-synchronous intrinsics. You can write your own load logic as well:
    ```python
   for ki in T.serial(0, (block_K // micro_size_k)):
@@ -418,7 +421,7 @@ def tl_matmul(
    ```
    Internally, these calls orchestrate how each thread in the warp issues the correct load instructions, performs address calculations, and stores the data into registers.
 
-4. **Perform the MMA Instruction**
+4. **Perform the MMA Instruction**
    After loading sub-tiles (fragments), the warp executes the `mma` instruction. This operation is essentially:
    \[
   C_{\text{local}} \;+=\; A_{\text{local}} \;\times\; B_{\text{local}}
@@ -429,7 +432,7 @@ def tl_matmul(
    ```
    Under the hood, this translates into Tensor Core instructions (e.g., `wmma.mma.sync` in PTX), which process multiple data elements per warp in parallel.
 
-5. **Store Results via `stmatrix`**
+5. **Store Results via `stmatrix`**
    Finally, you write the results from the warp-level fragments back to shared memory or global memory. This step might happen multiple times in a loop or just once at the end. The code snippet:
    ```python
   mma_emitter.stmatrix(C_local, C_shared)
@@ -444,6 +447,6 @@ By combining warp-synchronous intrinsics (`ldmatrix`, `mma`, `stmatrix`) with ma
 
 ## References
 
-- [NVIDIA CUTLASS Library](https://github.com/NVIDIA/cutlass): A collection of high-performance CUDA C++ template abstractions for GEMM.
-- [NVIDIA CUDA Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html): Official documentation for CUDA.
+- [NVIDIA CUTLASS Library](https://github.com/NVIDIA/cutlass): A collection of high-performance CUDA C++ template abstractions for GEMM.
+- [NVIDIA CUDA Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html): Official documentation for CUDA.
 - [PyTorch Documentation](https://pytorch.org/docs): For verifying correctness via CPU or GPU-based matmul.
format.sh

Lines changed: 3 additions & 0 deletions
@@ -80,6 +80,9 @@ elif [[ "${#FILES[@]}" -gt 0 ]]; then
   echo "Checking specified files: ${FILES[*]}..." >&2
 fi
 
+# Some systems set pip's default to --user, which breaks isolated virtualenvs.
+export PIP_USER=0
+
 # If pre-commit is not installed, install it.
 if ! python3 -m pre_commit --version &>/dev/null; then
   python3 -m pip install pre-commit
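To see why the export matters, here is a small sketch of the failure mode on systems that default pip to user installs (paths are illustrative):

```bash
# With PIP_USER=1 in the environment, pip behaves as if --user were
# passed, which errors out inside an isolated virtualenv:
python3 -m venv /tmp/venv
PIP_USER=1 /tmp/venv/bin/python -m pip install pre-commit
#   ERROR: Can not perform a '--user' install. User site-packages
#   are not visible in this virtualenv.
PIP_USER=0 /tmp/venv/bin/python -m pip install pre-commit  # succeeds
```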
