tile-ai · LeiWang1999 · Oct 19, 2025 · Oct 19, 2025
diff --git a/benchmark/matmul/README.md b/benchmark/matmul/README.md
@@ -0,0 +1,36 @@
+# FP16 Matmul Benchmark (8192×8192)
+
+This document records the throughput achieved by `benchmark_matmul.py` when multiplying FP16 matrices sized `M = N = 8192` across different `K` dimensions using the default autotuning search space.
+
+## Environment
+
+- Repository commit: `17bd0a6c651f599bec1397e0b91830c3ddc93076`
+- GPUs: `NVIDIA H800 SXM` on driver `560.35.05`
+
+## How to Reproduce
+
+```bash
+cd benchmark/matmul
+python - <<'PY'
+from benchmark_matmul import matmul
+
+M = 8192
+N = 8192
+for K in [256, 512, 1024, 2048, 4096, 8192, 16384]:
+    res = matmul(M, N, K, False)
+    tflops = 2 * M * N * K / (res.latency * 1e-3) * 1e-12  # res.latency is reported in milliseconds
+    print(f"K={K:5d}  latency={res.latency:.6f}ms  TFlops={tflops:.3f}")
+PY
+```
+
+## Results
+
+| K     | Latency (ms) | Throughput (TFLOPs) |
+|-------|-------------|---------------------|
+|   256 | 0.089056    | 386                 |
+|   512 | 0.132064    | 520                 |
+|  1024 | 0.218816    | 628                 |
+|  2048 | 0.390112    | 705                 |
+|  4096 | 0.746752    | 736                 |
+|  8192 | 1.449888    | 758                 |
+| 16384 | 2.871168    | 766                 |
diff --git a/benchmark/matmul/benchmark_matmul.py b/benchmark/matmul/benchmark_matmul.py
@@ -2,6 +2,7 @@
 import itertools
 import logging
 
+import tilelang
 import tilelang.language as T
 from tilelang.autotuner import autotune
 from tilelang import jit
@@ -187,6 +188,8 @@ def main(
 
             # Enable (or disable) swizzling optimization
             T.use_swizzle(panel_size=10, enable=enable_rasteration)
+            # to utilize swizzle tma layout
+            T.annotate_layout({C_shared: tilelang.layout.make_swizzled_layout(C_shared)})
 
             # Clear out the accumulation buffer
             T.clear(C_local)

diff --git a/benchmark/matmul_fp8/README.md b/benchmark/matmul_fp8/README.md
@@ -27,10 +27,10 @@ PY
 
 | K     | Latency (s) | Throughput (TFLOPs) |
 |-------|-------------|---------------------|
-|   256 | 0.091488    | 376                 |
-|   512 | 0.110496    | 622                 |
-|  1024 | 0.148256    | 927                 |
-|  2048 | 0.234080    | 1174                |
-|  4096 | 0.398944    | 1378                |
-|  8192 | 0.752416    | 1461                |
-| 16384 | 1.443808    | 1523                |
+|   256 | 0.060352    | 569                 |
+|   512 | 0.080096    | 858                 |
+|  1024 | 0.121696    | 1129                |
+|  2048 | 0.204672    | 1343                |
+|  4096 | 0.374816    | 1467                |
+|  8192 | 0.729664    | 1507                |
+| 16384 | 1.427264    | 1541                |
diff --git a/benchmark/matmul_fp8/benchmark_matmul.py b/benchmark/matmul_fp8/benchmark_matmul.py
@@ -1,7 +1,7 @@
 import argparse
 import itertools
 import logging
-
+import tilelang
 import tilelang.language as T
 from tilelang.autotuner import autotune
 from tilelang import jit
@@ -190,6 +190,8 @@ def main(
 
             # Enable (or disable) swizzling optimization
             T.use_swizzle(panel_size=10, enable=enable_rasteration)
+            # to utilize swizzle tma layout
+            T.annotate_layout({C_shared: tilelang.layout.make_swizzled_layout(C_shared)})
 
             # Clear out the accumulation buffer
             T.clear(C_local)