
Commit 8bc173a

Merge branch 'main' of https://github.com/microsoft/TileLang into main
2 parents: 89a0b06 + eb7c598


90 files changed: +14,436 additions, −11,093 deletions


.github/workflows/ci.yml

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
+name: CI
+
+on: [pull_request]
+
+jobs:
+  format-check:
+    runs-on: self-hosted
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.9'
+
+      - name: Create virtual environment
+        run: python -m venv bitblas_ci
+
+      - name: Activate virtual environment and install dependencies
+        run: |
+          source bitblas_ci/bin/activate
+          python -m pip install --upgrade pip
+          if [ -f requirements-dev.txt ]; then python -m pip install -r requirements-dev.txt; fi
+
+      - name: Update submodules recursively
+        run: git submodule update --init --recursive
+
+      - name: Run format check
+        run: |
+          source bitblas_ci/bin/activate
+          ./format.sh
+
+  build-test:
+    runs-on: self-hosted
+    needs: format-check
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.9'
+
+      - name: Create virtual environment
+        run: python -m venv bitblas_ci
+
+      - name: Activate virtual environment and install dependencies
+        run: |
+          source bitblas_ci/bin/activate
+          python -m pip install --upgrade pip
+          if [ -f requirements-test.txt ]; then python -m pip install -r requirements-test.txt; fi
+
+      - name: Install project in wheel mode
+        run: |
+          source bitblas_ci/bin/activate
+          python -m pip install .
+
+      - name: Run tests
+        run: |
+          source bitblas_ci/bin/activate
+          cd testing/python
+          python -m pytest

README.md

Lines changed: 1 addition & 3 deletions
@@ -201,7 +201,7 @@ def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="flo
 
 In addition to GEMM, we provide a variety of examples to showcase the versatility and power of TileLang, including:
 
-- [Dequantize GEMM](./examples/dequantize_gemm/): Achieve high-performance dequantization by **fine-grained control over per-thread operations**, with many features now adopted as default behaviors in [BitBLAS](https://github.com/microsoft/BitBLAS), which utilzing magic layout transformation and intrins to accelerate dequantize gemm.
+- [Dequantize GEMM](./examples/dequantize_gemm/): Achieve high-performance dequantization by **fine-grained control over per-thread operations**, with many features now adopted as default behaviors in [BitBLAS](https://github.com/microsoft/BitBLAS), which utilizing magic layout transformation and intrins to accelerate dequantize gemm.
 - [FlashAttention](./examples/flash_attention/): Enable cross-operator fusion with simple and intuitive syntax, and we also provide an example of auto tuning.
 - [LinearAttention](./examples/linear_attention/): Examples include RetNet and Mamba implementations.
 - [Convolution](./examples/convolution/): Implementations of Convolution with IM2Col.
@@ -227,5 +227,3 @@ This project may contain trademarks or logos for projects, products, or services
 ## Acknowledgements
 
 We learned a lot from the [TVM](https://github.com/apache/tvm) community and would like to thank them for their contributions.
-
-This project was initiated by [yining shi](https://github.com/nox-410), and continued by [lei wang](https://github.com/LeiWang1999) and [yu cheng](https://github.com/chengyupku). It was completed under the guidance of [yuqing xia](https://github.com/xiayuqing0622), [lingxiao ma](https://github.com/xysmlx) and [jilong xue](https://github.com/jlxue) from [MSRA System Research Group](https://www.microsoft.com/en-us/research/group/systems-and-networking-research-group-asia/).

docs/Installation.md

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 - **Python Version**: >= 3.8
 - **CUDA Version**: >= 11.0
 
-The easiest way to install TileLang is direcly from the PyPi using pip. To install the latest version, run the following command in your terminal.
+The easiest way to install TileLang is directly from the PyPi using pip. To install the latest version, run the following command in your terminal.
 
 **Note**: Currently, TileLang whl is only supported on Ubuntu 20.04 or later version as we build the whl files on this platform. Currently we only provide whl files for CUDA>=11.0 and with Python>=3.8. **If you are using a different platform or environment, you may need to [build TileLang from source](https://github.com/microsoft/TileLang/blob/main/docs/Installation.md#building-from-source).**
 
examples/dequantize_gemm/example_dequant_gemm_fp4_hopper.py

Lines changed: 68 additions & 41 deletions
@@ -12,6 +12,7 @@
 import argparse
 from functools import partial
 
+
 def _tir_u8_to_f4_to_f16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, dtype: str):
     assert nbit == 4
     assert dtype == "float16"
@@ -24,12 +25,15 @@ def _tir_u8_to_f4_to_f16(nbit: int, val: tir.PrimExpr, pos: tir.PrimExpr, dtype:
     s = f4 >> tir.const(3, "uint16")
     e_f4 = f4 & tir.const(7, "uint16")
     e_f16 = e_f4 | tir.const(8, "uint16")
-    val_f16 = tir.reinterpret("float16",
-                              ((e_f16 | (s << tir.const(5, "uint16"))) << tir.const(10, "uint16")).astype("uint16"))
+    val_f16 = tir.reinterpret(
+        "float16",
+        ((e_f16 | (s << tir.const(5, "uint16"))) << tir.const(10, "uint16")).astype("uint16"))
     # return tir.Select(e_f4 == tir.const(0, "uint32"), tir.const(0, "float16"), val_f16)
     return val_f16
 
+
 def torch_convert(tensor):
+
     def print_bit(name, val):
         val_cpu = val.cpu().item()
         binary_repr = f'{val_cpu:032b}'
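For context on the reformatted hunk above: `_tir_u8_to_f4_to_f16` treats each 4-bit value as 1 sign bit plus a 3-bit exponent (no mantissa) and rebuilds the matching float16 bit pattern. A minimal standalone sketch of the same shift/mask logic follows; the nibble extraction is an assumption, since this hunk only shows the lines from `s = f4 >> 3` onward.

import numpy as np

def fp4_nibble_to_fp16(byte_val: int, pos: int) -> np.float16:
    # Assumed nibble extraction (not shown in the hunk): pos 0 = low nibble, 1 = high.
    f4 = (byte_val >> (pos * 4)) & 0xF
    s = f4 >> 3                                 # sign bit
    e_f4 = f4 & 7                               # 3-bit exponent
    e_f16 = e_f4 | 8                            # re-bias into the fp16 exponent range
    bits = ((e_f16 | (s << 5)) << 10) & 0xFFFF  # sign -> bit 15, exponent -> bits 14..10
    return np.array([bits], dtype=np.uint16).view(np.float16)[0]

packed = (0b1111 << 4) | 0b0111     # high nibble encodes -1.0, low nibble +1.0
print(fp4_nibble_to_fp16(packed, 0), fp4_nibble_to_fp16(packed, 1))  # 1.0 -1.0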
@@ -46,7 +50,7 @@ def _convert(val, pos):
         val_f16 = ((e_f16 | (s << 5)) << 10) & 0xFFFF
         lower_16_bits = (val_f16 & 0xFFFF).to(torch.uint16)
         return lower_16_bits.view(torch.float16)
-
+
     N = tensor.shape[0]
     K = tensor.shape[1]
     new_tensor = torch.empty(N, K * 2, dtype=torch.float16, device=tensor.device)
@@ -55,6 +59,7 @@ def _convert(val, pos):
             new_tensor[i][j] = _convert(tensor[i][j // 2], j % 2)
     return new_tensor
 
+
 def test_convert(N, K, block_N, block_K, in_dtype, num_bits=4, threads=128):
     num_elems_per_byte = 8 // num_bits
     storage_dtype = "uint8"
@@ -64,18 +69,15 @@ def test_convert(N, K, block_N, block_K, in_dtype, num_bits=4, threads=128):
 
     @T.prim_func
     def main(
-        B: T.Buffer(B_shape, storage_dtype),
-        C: T.Buffer((N, K), in_dtype),
+            B: T.Buffer(B_shape, storage_dtype),
+            C: T.Buffer((N, K), in_dtype),
     ):
         with T.Kernel(T.ceildiv(N, block_N), threads=threads) as (bx):
             B_shared = T.alloc_shared(B_shared_shape, storage_dtype)
             B_local = T.alloc_fragment(B_shared_shape, storage_dtype)
             B_dequantize_local = T.alloc_fragment(B_dequantize_shared_shape, in_dtype)
 
-            for k in T.Pipelined(
-                T.ceildiv(K, block_K),
-                num_stages=1
-            ):
+            for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=1):
                 T.copy(B[bx * block_N, k * block_K // num_elems_per_byte], B_shared)
                 T.copy(B_shared, B_local)
                 for i, j in T.Parallel(block_N, block_K):
@@ -89,12 +91,13 @@ def main(
 
     return main
 
+
 def test_fp4_fp16_convert_close():
     N, K = 256, 256
     block_N, block_K = 64, 64
     program = test_convert(
         N,
-        K,
+        K,
         block_N,
         block_K,
         "float16",
@@ -109,6 +112,7 @@ def test_fp4_fp16_convert_close():
     assert torch.allclose(tl_out, ref_out, rtol=0.01, atol=0.01), (tl_out, ref_out)
     print("Pass")
 
+
 def get_configs():
     block_M = [128]
     block_N = [128, 256]
@@ -118,13 +122,19 @@ def get_configs():
     splits = [1]
     _configs = list(itertools.product(block_M, block_N, block_K, num_stages, threads, splits))
 
-    configs = [
-        {'block_M': c[0], 'block_N': c[1], 'block_K': c[2], 'num_stages': c[3], 'threads': c[4], 'split': c[5]}
-        for c in _configs
-    ]
+    configs = [{
+        'block_M': c[0],
+        'block_N': c[1],
+        'block_K': c[2],
+        'num_stages': c[3],
+        'threads': c[4],
+        'split': c[5]
+    } for c in _configs]
     return configs
 
+
 def matmul(M, N, K, in_dtype, out_dtype, accum_dtype, num_bits=4, tune=False):
+
     def kernel_func(block_M, block_N, block_K, num_stages, threads, split=1):
         num_elems_per_byte = 8 // num_bits
         storage_dtype = "uint8"
@@ -142,10 +152,13 @@ def main_split(
                 B: T.Buffer(B_shape, storage_dtype),
                 Ct: T.Buffer((N, M), out_dtype),
         ):
-            SplitC = T.alloc_buffer(
-                [split, (N + block_N - 1) // block_N * block_N, (M + block_M - 1) // block_M * block_M], out_dtype
-            )
-            with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), split, threads=threads) as (bx, by, bz):
+            SplitC = T.alloc_buffer([
+                split, (N + block_N - 1) // block_N * block_N,
+                (M + block_M - 1) // block_M * block_M
+            ], out_dtype)
+            with T.Kernel(
+                    T.ceildiv(N, block_N), T.ceildiv(M, block_M), split,
+                    threads=threads) as (bx, by, bz):
                 A_shared = T.alloc_shared(A_shared_shape, in_dtype)
                 B_shared = T.alloc_shared(B_shared_shape, storage_dtype)
                 B_local = T.alloc_fragment(B_shared_shape, storage_dtype)
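The padded SplitC extents above use the usual round-up-to-multiple idiom; a one-line check of the arithmetic (values chosen arbitrarily for illustration):

# (N + block_N - 1) // block_N * block_N rounds N up to the next multiple of block_N.
N, block_N = 1000, 128
padded = (N + block_N - 1) // block_N * block_N
assert padded == 1024 and padded % block_N == 0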
@@ -154,12 +167,10 @@ def main_split(
                 Ct_local = T.alloc_fragment((block_N, block_M), accum_dtype)
                 Ct_shared = T.alloc_shared((block_N, block_M), out_dtype)
 
-                T.annotate_layout(
-                    {
-                        B_shared: tilelang.layout.make_swizzled_layout(B_shared),
-                        Ct_shared: tilelang.layout.make_swizzled_layout(Ct_shared),
-                    }
-                )
+                T.annotate_layout({
+                    B_shared: tilelang.layout.make_swizzled_layout(B_shared),
+                    Ct_shared: tilelang.layout.make_swizzled_layout(Ct_shared),
+                })
 
                 T.clear(Ct_local)
                 for k in T.Pipelined(K // (block_K * split), num_stages=num_stages):
@@ -175,22 +186,24 @@ def main_split(
                     )
                     T.copy(B_dequantize_local, B_dequantize_prev_local)
                     T.gemm(B_dequantize_prev_local, A_shared, Ct_local, transpose_B=True)
-                    T.copy(Ct_local, SplitC[bz, bx * block_N : (bx + 1) * block_N, by * block_M : (by + 1) * block_M])
+                    T.copy(Ct_local, SplitC[bz, bx * block_N:(bx + 1) * block_N,
+                                            by * block_M:(by + 1) * block_M])
             with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M)) as (bx, by):
                 acc = T.alloc_fragment((block_N, block_M), out_dtype)
                 T.clear(acc)
                 for k in range(split):
                     for i, j in T.Parallel(block_N, block_M):
                         acc[i, j] += SplitC[k, bx * block_N + i, by * block_M + j]
                 T.copy(acc, Ct[bx * block_N, by * block_M])
-
+
         @T.prim_func
         def main(
                 A: T.Buffer(A_shape, in_dtype),
                 B: T.Buffer(B_shape, storage_dtype),
                 Ct: T.Buffer((N, M), out_dtype),
         ):
-            with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by):
+            with T.Kernel(
+                    T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by):
                 A_shared = T.alloc_shared(A_shared_shape, in_dtype)
                 B_shared = T.alloc_shared(B_shared_shape, storage_dtype)
                 B_local = T.alloc_fragment(B_shared_shape, storage_dtype)
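The split-K structure in main_split above computes partial GEMMs over disjoint K chunks into SplitC and then sums them in a second kernel. The same reduction in plain NumPy, as a reference for what the two kernels compute together (shapes chosen arbitrarily; this is an analogue, not TileLang code):

import numpy as np

M, N, K, split = 64, 64, 256, 4
A = np.random.randn(M, K).astype(np.float32)
B = np.random.randn(N, K).astype(np.float32)      # B is stored (N, K), as in the kernel

chunk = K // split
partials = [B[:, i * chunk:(i + 1) * chunk] @ A[:, i * chunk:(i + 1) * chunk].T
            for i in range(split)]                 # analogue of the SplitC[bz] slices
Ct = np.sum(partials, axis=0)                      # analogue of the acc reduction kernel

assert np.allclose(Ct, B @ A.T, rtol=1e-3, atol=1e-3)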
@@ -199,12 +212,10 @@ def main(
                 Ct_local = T.alloc_fragment((block_N, block_M), accum_dtype)
                 Ct_shared = T.alloc_shared((block_N, block_M), out_dtype)
 
-                T.annotate_layout(
-                    {
-                        B_shared: tilelang.layout.make_swizzled_layout(B_shared),
-                        Ct_shared: tilelang.layout.make_swizzled_layout(Ct_shared),
-                    }
-                )
+                T.annotate_layout({
+                    B_shared: tilelang.layout.make_swizzled_layout(B_shared),
+                    Ct_shared: tilelang.layout.make_swizzled_layout(Ct_shared),
+                })
 
                 T.clear(Ct_local)
                 for k in T.Pipelined(K // block_K, num_stages=num_stages):
@@ -221,38 +232,51 @@ def main(
                     T.copy(B_dequantize_local, B_dequantize_prev_local)
                     T.gemm(B_dequantize_prev_local, A_shared, Ct_local, transpose_B=True)
                     T.copy(Ct_local, Ct_shared)
-                    T.copy(Ct_shared, Ct[bx * block_N : (bx + 1) * block_N, by * block_M : (by + 1) * block_M])
+                    T.copy(Ct_shared, Ct[bx * block_N:(bx + 1) * block_N,
+                                         by * block_M:(by + 1) * block_M])
 
         if split == 1:
             return main
         else:
             return main_split
-
+
     if tune:
+
         @autotune(
             configs=get_configs(),
             keys=["block_M", "block_N", "block_K", "num_stages", "threads", "split"],
             warmup=10,
-            rep=10
-        )
-        @jit(out_idx=[2], supply_type=tilelang.TensorSupplyType.Integer, ref_prog=None, profiler="auto")
-        def kernel(block_M=None, block_N=None, block_K=None, num_stages=None, threads=None, split=None):
+            rep=10)
+        @jit(
+            out_idx=[2],
+            supply_type=tilelang.TensorSupplyType.Integer,
+            ref_prog=None,
+            profiler="auto")
+        def kernel(block_M=None,
+                   block_N=None,
+                   block_K=None,
+                   num_stages=None,
+                   threads=None,
+                   split=None):
             return kernel_func(block_M, block_N, block_K, num_stages, threads, split)
 
         return kernel()
     else:
+
         def kernel(block_M, block_N, block_K, num_stages, threads, split=1):
             return kernel_func(block_M, block_N, block_K, num_stages, threads, split)
 
         return kernel
 
+
 def ref_program(A, qB):
     dtypeC = "float16"
     B = torch_convert(qB)
     C = torch.matmul(A.to(torch.float), B.T.to(torch.float))
     C = C.to(torch.__getattribute__(dtypeC))
     return C.transpose(0, 1)
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument('--m', type=int, default=256, help='M')
@@ -264,7 +288,9 @@ def ref_program(A, qB):
     total_flops = 2 * M * N * K
 
     if (not args.tune):
-        program = matmul(M, N, K, "float16", "float16", "float32", num_bits=4, tune=args.tune)(block_M=128, block_N=128, block_K=128, num_stages=2, threads=256, split=1)
+        program = matmul(
+            M, N, K, "float16", "float16", "float32", num_bits=4, tune=args.tune)(
+                block_M=128, block_N=128, block_K=128, num_stages=2, threads=256, split=1)
         mod, params = tilelang.lower(program)
         mod = Profiler(mod, params, [2], tilelang.TensorSupplyType.Integer)
         mod.assert_allclose(ref_program, rtol=0.01, atol=0.01)
@@ -276,7 +302,8 @@ def ref_program(A, qB):
         print("Tile-lang: {:.2f} ms".format(latency))
         print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9))
     else:
-        best_latency, best_config, ref_latency = matmul(M, N, K, "float16", "float16", "float32", num_bits=4, tune=args.tune)
+        best_latency, best_config, ref_latency = matmul(
+            M, N, K, "float16", "float16", "float32", num_bits=4, tune=args.tune)
         print(f"Best latency: {best_latency}")
         print(f"Best TFlops: {total_flops / best_latency * 1e-9}")
         print(f"Best config: {best_config}")

examples/gemm/example_gemm.py

Lines changed: 6 additions & 9 deletions
@@ -6,18 +6,15 @@
 import tilelang.language as T
 
 
-def matmul(
-    M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"
-):
+def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"):
+
     @T.prim_func
     def main(
-        A: T.Buffer((M, K), dtype),
-        B: T.Buffer((K, N), dtype),
-        C: T.Buffer((M, N), dtype),
+            A: T.Buffer((M, K), dtype),
+            B: T.Buffer((K, N), dtype),
+            C: T.Buffer((M, N), dtype),
     ):
-        with T.Kernel(
-            T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128
-        ) as (bx, by):
+        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
             A_shared = T.alloc_shared((block_M, block_K), dtype)
             B_shared = T.alloc_shared((block_K, block_N), dtype)
             C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
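The hunk above ends at the buffer allocations because of the hunk boundary; the rest of example_gemm.py is not shown in this commit view. As a self-contained sketch of how such a TileLang GEMM is typically completed, lowered, and checked, assembled from the patterns visible in the dequantize example earlier in this commit (the kernel continuation, the block/stage numbers, and the Profiler import path are assumptions, not the actual file contents):

import torch
import tilelang
import tilelang.language as T
from tilelang import Profiler  # import path assumed; the dequantize example above uses Profiler directly


def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"):

    @T.prim_func
    def main(
            A: T.Buffer((M, K), dtype),
            B: T.Buffer((K, N), dtype),
            C: T.Buffer((M, N), dtype),
    ):
        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
            A_shared = T.alloc_shared((block_M, block_K), dtype)
            B_shared = T.alloc_shared((block_K, block_N), dtype)
            C_local = T.alloc_fragment((block_M, block_N), accum_dtype)

            # Hypothetical continuation, mirroring the pipelined copy / gemm / copy-out
            # structure of the dequantize example in this commit.
            T.clear(C_local)
            for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):
                T.copy(A[by * block_M, k * block_K], A_shared)
                T.copy(B[k * block_K, bx * block_N], B_shared)
                T.gemm(A_shared, B_shared, C_local)
            T.copy(C_local, C[by * block_M, bx * block_N])

    return main


def ref_program(A, B):
    # Plain PyTorch reference for the kernel output C = A @ B.
    return (A.to(torch.float) @ B.to(torch.float)).to(torch.float16)


program = matmul(1024, 1024, 1024, 128, 128, 32)
mod, params = tilelang.lower(program)
profiler = Profiler(mod, params, [2], tilelang.TensorSupplyType.Integer)
profiler.assert_allclose(ref_program, rtol=0.01, atol=0.01)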
