Commit a3f2564

Refactor GEMM layout and testing for improved clarity and functionality
- Updated `gemm_layouts.cc` to enhance layout generation logic for transposed and non-transposed GEMM operations.
- Improved block realization handling in `gemm_py.cc` for better assignment of global symbols.
- Streamlined buffer read/write region handling in `inject_pipeline.cc` for clarity.
- Enhanced test cases in `test_tilelang_tilelibrary_gemm.py` by adjusting function calls and adding new GEMM operation combinations.

These changes improve the clarity, functionality, and robustness of GEMM operations and their testing in the TileLang framework.
1 parent e36740d commit a3f2564

15 files changed: +599 additions, -362 deletions

src/layout/gemm_layouts.cc

Lines changed: 4 additions & 2 deletions

@@ -205,14 +205,16 @@ Fragment makeGemmFragmentB(const int block_m, const int block_n,
   ICHECK(block_k % 16 == 0);
   if (transposed) {
     auto base_layout = makeGemmFragment8x8()->Repeat({1, 2}, false, false);
-    auto warp_layout = base_layout->Replicate(block_m / warp_m)->Repeat({block_n / warp_n, 1}, true, false);
+    auto warp_layout = base_layout->Replicate(block_m / warp_m)
+                           ->Repeat({block_n / warp_n, 1}, true, false);
     auto block_layout =
         warp_layout->Repeat({warp_n / 8, block_k / 16}, false, false);
     return block_layout;
   } else {
     auto base_layout =
         makeGemmFragment8x8Transposed()->Repeat({2, 1}, false, false);
-    auto warp_layout = base_layout->Replicate(block_m / warp_m)->Repeat({1, block_n / warp_n}, true);
+    auto warp_layout = base_layout->Replicate(block_m / warp_m)
+                           ->Repeat({1, block_n / warp_n}, true);
     auto block_layout =
         warp_layout->Repeat({block_k / 16, warp_n / 8}, false, true);
     return block_layout;
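
For intuition, the Repeat factors above compose back to the full B tile. The following is a minimal Python sketch of that shape arithmetic, not the Fragment API itself: it assumes Repeat({a, b}, ...) tiles the fragment's (rows, cols) shape by (a, b) and that Replicate spreads the fragment across warps without changing its shape.

# Minimal sketch of the shape arithmetic implied by the Repeat factors above.
# Assumptions (not the real Fragment API): Repeat multiplies the 2-D shape by
# its factors; Replicate leaves the per-fragment shape unchanged.

def repeat(shape, factors):
    return (shape[0] * factors[0], shape[1] * factors[1])

def gemm_fragment_b_shape(block_n, block_k, warp_n, transposed):
    if transposed:                                     # B stored as (N, K)
        base = repeat((8, 8), (1, 2))                  # 8 x 16 base fragment
        warp = repeat(base, (block_n // warp_n, 1))
        return repeat(warp, (warp_n // 8, block_k // 16))
    else:                                              # B stored as (K, N)
        base = repeat((8, 8), (2, 1))                  # 16 x 8 base fragment
        warp = repeat(base, (1, block_n // warp_n))
        return repeat(warp, (block_k // 16, warp_n // 8))

# The factors compose back to the full B tile:
assert gemm_fragment_b_shape(128, 32, 64, transposed=True) == (128, 32)   # (block_n, block_k)
assert gemm_fragment_b_shape(128, 32, 64, transposed=False) == (32, 128)  # (block_k, block_n)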

src/op/copy.cc

Lines changed: 0 additions & 1 deletion

@@ -402,7 +402,6 @@ LayoutMap CopyNode::InferLayout(const LayoutInferArgs &T,
   PassContext pass_ctx = PassContext::Current();
   bool disable_tma_lower =
       pass_ctx->GetConfig<bool>(kDisableTMALower, false).value();
-
   auto copy_inst = GetCopyInst(target, disable_tma_lower || disable_tma,
                                T.layout_map, T.analyzer, T.buffer_oob);
   if (copy_inst == CopyInst::kBulkLoad || copy_inst == CopyInst::kBulkStore) {

src/op/gemm.cc

Lines changed: 7 additions & 2 deletions

@@ -244,14 +244,20 @@ GemmWarpPolicyNode::ComputeWarpPartition(int M, int N, int block_size,
   int best_m = 1;
   int best_n = 1;
   float best_balance = std::numeric_limits<float>::max();
-
   // Try all possible combinations that satisfy the constraints
   for (int m = 1; m <= max_m_warps && m <= num_warps; m++) {
     int n = num_warps / m;

     // Calculate how balanced this partition is
     float m_per_warp = static_cast<float>(M) / (m * kMPerWarp);
     float n_per_warp = static_cast<float>(N) / (n * kNPerWarp);
+    // m_per_warp and n_per_warp must be greater than 1
+    if (m_per_warp < 1 || n_per_warp < 1)
+      continue;
+    // m * n must equal num_warps
+    if (m * n != num_warps)
+      continue;
+
     float balance = std::abs(m_per_warp / n_per_warp - ideal_ratio);

     if (balance < best_balance) {

@@ -266,7 +272,6 @@ GemmWarpPolicyNode::ComputeWarpPartition(int M, int N, int block_size,
   } else {
     ICHECK(0) << "Unknown GemmWarpPolicy";
   }
-
   // Store the computed values in the object's member variables
   this->m_warp = m_warp;
   this->n_warp = n_warp;
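
The two new checks reject partitions that would give a warp less than one tile along M or N, or that do not use every warp. Below is a minimal Python sketch of the balance search under stated assumptions: kMPerWarp, kNPerWarp, ideal_ratio, and max_m_warps come from surrounding code not shown in this hunk, so the defaults here (16, 8, 1.0, and no cap on m) are illustrative placeholders rather than TileLang's actual values.

# Standalone sketch of the warp-partition balance search above.
# k_m_per_warp, k_n_per_warp and ideal_ratio are assumed values; the C++ code
# also caps m at max_m_warps, which this sketch omits for brevity.

def compute_warp_partition(M, N, num_warps, k_m_per_warp=16, k_n_per_warp=8,
                           ideal_ratio=1.0):
    best_m, best_n = 1, 1
    best_balance = float("inf")
    for m in range(1, num_warps + 1):
        n = num_warps // m
        m_per_warp = M / (m * k_m_per_warp)
        n_per_warp = N / (n * k_n_per_warp)
        # New constraints from this commit: every warp gets at least one tile
        # in each dimension, and the m x n grid must use all warps.
        if m_per_warp < 1 or n_per_warp < 1:
            continue
        if m * n != num_warps:
            continue
        balance = abs(m_per_warp / n_per_warp - ideal_ratio)
        if balance < best_balance:
            best_balance, best_m, best_n = balance, m, n
    return best_m, best_n

# Example: a 128x128 block on 4 warps settles on a 2x2 warp grid under these
# assumed per-warp tile sizes.
print(compute_warp_partition(128, 128, 4))  # -> (2, 2)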

src/op/gemm_py.cc

Lines changed: 8 additions & 7 deletions

@@ -234,18 +234,19 @@ Stmt GemmPyNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const {
     BlockRealize block_realize = Downcast<BlockRealize>(prim_func->body);
     auto block = block_realize->block;
     {
-      BlockNode* n = block.CopyOnWrite();
+      BlockNode *n = block.CopyOnWrite();
       n->name_hint = global_symbol.value();
     }
-    return BlockRealize(block_realize->iter_values, block_realize->predicate, block);
+    return BlockRealize(block_realize->iter_values, block_realize->predicate,
+                        block);
   }
   // warp with block realize node
   return BlockRealize(
-    /*iter_values=*/Array<PrimExpr>(),
-    /*predicate=*/const_true(),
-    /*block=*/
-    Block(/*iter_vars=*/{}, /*reads=*/{}, /*writes=*/{},
-          /*name_hint=*/global_symbol.value(), prim_func->body));
+      /*iter_values=*/Array<PrimExpr>(),
+      /*predicate=*/const_true(),
+      /*block=*/
+      Block(/*iter_vars=*/{}, /*reads=*/{}, /*writes=*/{},
+            /*name_hint=*/global_symbol.value(), prim_func->body));
   } else {
     LOG(FATAL) << "No lower function found for gemm_py";
   }

src/target/codegen_cuda.cc

Lines changed: 2 additions & 6 deletions

@@ -1331,16 +1331,12 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) {
       os << "}\n";
     } else {
       std::string smem_elem_offset = this->PrintExpr(op->args[6]);
-      // need_cast_smem_ptr_to_int_ = true;
-      // this->stream << PrintLoadMatrixAssembly(trans, num, type, local_ptr,
-      //                                         local_elem_offset, smem_ptr,
-      //                                         smem_elem_offset);
      std::string func_name = "tl::ptx_ldmatrix_x" + std::to_string(num);
      if (trans == 1)
        func_name += "_trans";
-      // this->stream << func_name << "(" << local_ptr "" << ", " << smem_ptr << ");\n";
      this->PrintIndent();
-      this->stream << func_name << "(" << smem_ptr << " + " << smem_elem_offset<< ", " << local_ptr << " + " << local_elem_offset << ");\n";
+      this->stream << func_name << "(" << smem_ptr << " + " << smem_elem_offset
+                   << ", " << local_ptr << " + " << local_elem_offset << ");\n";
     }
   } else if (op->op.same_as(builtin::mma_store())) {
     int m = Downcast<Integer>(op->args[0])->value;

src/transform/inject_pipeline.cc

Lines changed: 3 additions & 2 deletions

@@ -951,8 +951,9 @@ class PipelineInjector : private StmtExprMutator {

     Block block = Downcast<Block>(StmtExprMutator::VisitStmt_(op));

-    Array<Array<BufferRegion>> access = GetBlockReadWriteRegion(block, buffer_data_to_buffer_);
-    BlockNode* n = block.CopyOnWrite();
+    Array<Array<BufferRegion>> access =
+        GetBlockReadWriteRegion(block, buffer_data_to_buffer_);
+    BlockNode *n = block.CopyOnWrite();
     n->reads = access[0];
     n->writes = access[1];

testing/python/tilelibrary/test_tilelang_tilelibrary_gemm.py

Lines changed: 62 additions & 45 deletions

@@ -1,4 +1,3 @@
-from asyncio import threads
 from tilelang import tvm as tvm
 import tilelang.testing

@@ -90,7 +89,9 @@ def run_gemm_ss(
         tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
         tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
     })
-    profiler = kernel.get_profiler()
+
+    print(kernel.get_kernel_source())
+    profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal)

     def ref_program(A, B):
         import torch

@@ -109,11 +110,21 @@ def ref_program(A, B):
 def test_gemm_ss():
     # More test case can be found in kernel/test_tilelang_kernel_gemm.py
     # GEMM tests for float16
-    run_gemm_ss(512, 1024, 768, False, True, "float16", "float16", "float16", 128, 128, 32, 0)
-    run_gemm_ss(512, 1024, 768, False, False, "float16", "float16", "float16", 128, 128, 32, 0)
-    run_gemm_ss(512, 1024, 768, True, False, "float16", "float16", "float16", 128, 128, 32, 0)
-    run_gemm_ss(512, 1024, 768, True, True, "float16", "float16", "float16", 128, 128, 32, 0)
-
+    run_gemm_ss(512, 1024, 768, False, True, "float16", "float16", "float16", 128, 128, 32, 2)
+    run_gemm_ss(512, 1024, 768, False, False, "float16", "float16", "float16", 128, 128, 32, 2)
+    run_gemm_ss(512, 1024, 768, True, False, "float16", "float16", "float16", 128, 128, 32, 2)
+    run_gemm_ss(512, 1024, 768, True, True, "float16", "float16", "float16", 128, 128, 32, 2)
+    # n8 test
+    run_gemm_ss(128, 8, 32, False, True, "float16", "float16", "float16", 128, 8, 32, 0, 128)
+
+    # int8 test
+    run_gemm_ss(128, 128, 128, False, True, "int8", "int8", "int32", 128, 128, 32, 2)
+    run_gemm_ss(128, 128, 128, False, False, "int8", "int8", "int32", 128, 128, 32, 2)
+    run_gemm_ss(128, 128, 128, True, False, "int8", "int8", "int32", 128, 128, 32, 2)
+    run_gemm_ss(128, 128, 128, True, True, "int8", "int8", "int32", 128, 128, 32, 2)
+
+    # float8 tests
+    run_gemm_ss(128, 128, 128, True, True, "float8_e5m2", "float8_e5m2", "float32", 128, 128, 32, 2)


 def matmul_rs(

@@ -208,7 +219,7 @@ def run_gemm_rs(
         tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
         tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
     })
-    profiler = kernel.get_profiler()
+    profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal)

     def ref_program(A, B):
         import torch

@@ -226,8 +237,22 @@ def ref_program(A, B):

 def test_gemm_rs():
     # GEMM tests for float16
-    run_gemm_rs(512, 1024, 768, False, False, "float16", "float16", "float16", 128, 256, 32, 0)
-    run_gemm_rs(512, 1024, 768, False, True, "float16", "float16", "float16", 128, 256, 32, 0)
+    run_gemm_rs(512, 1024, 768, False, False, "float16", "float16", "float16", 128, 256, 32, 2)
+    run_gemm_rs(512, 1024, 768, False, True, "float16", "float16", "float16", 128, 256, 32, 2)
+    run_gemm_rs(512, 1024, 768, True, False, "float16", "float16", "float16", 128, 256, 32, 2)
+    run_gemm_rs(512, 1024, 768, True, True, "float16", "float16", "float16", 128, 256, 32, 2)
+
+    # n8 tests
+    run_gemm_rs(128, 8, 32, False, True, "float16", "float16", "float16", 128, 8, 32, 0, 128)
+
+    # int8 tests
+    run_gemm_rs(128, 128, 128, False, True, "int8", "int8", "int32", 128, 128, 32, 2)
+    run_gemm_rs(128, 128, 128, False, False, "int8", "int8", "int32", 128, 128, 32, 2)
+    run_gemm_rs(128, 128, 128, True, False, "int8", "int8", "int32", 128, 128, 32, 2)
+    run_gemm_rs(128, 128, 128, True, True, "int8", "int8", "int32", 128, 128, 32, 2)
+
+    # float8 tests
+    run_gemm_rs(128, 128, 128, True, True, "float8_e5m2", "float8_e5m2", "float32", 128, 128, 32, 2)


 def matmul_sr(

@@ -322,7 +347,7 @@ def run_gemm_sr(
         tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
         tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
     })
-    profiler = kernel.get_profiler()
+    profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal)

     def ref_program(A, B):
         import torch

@@ -345,6 +370,18 @@ def test_gemm_sr():
     run_gemm_sr(512, 1024, 768, True, False, "float16", "float16", "float16", 128, 256, 32, 2)
     run_gemm_sr(512, 1024, 768, True, True, "float16", "float16", "float16", 128, 256, 32, 2)

+    # n8 tests
+    run_gemm_sr(128, 8, 32, False, True, "float16", "float16", "float16", 128, 8, 32, 0, 128)
+
+    # int8 tests
+    run_gemm_sr(128, 128, 32, False, True, "int8", "int8", "int32", 128, 128, 32, 2)
+    run_gemm_sr(128, 128, 32, False, False, "int8", "int8", "int32", 128, 128, 32, 2)
+    run_gemm_sr(128, 128, 32, True, False, "int8", "int8", "int32", 128, 128, 32, 2)
+    run_gemm_sr(128, 128, 32, True, True, "int8", "int8", "int32", 128, 128, 32, 2)
+
+    # float8 tests
+    run_gemm_sr(128, 128, 128, True, True, "float8_e5m2", "float8_e5m2", "float32", 128, 128, 32, 2)
+

 def matmul_rr(
     M,

@@ -442,7 +479,7 @@ def run_gemm_rr(
         tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
         tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
     })
-    profiler = kernel.get_profiler()
+    profiler = kernel.get_profiler(tensor_supply_type=tilelang.TensorSupplyType.Normal)

     def ref_program(A, B):
         import torch

@@ -465,40 +502,20 @@ def test_gemm_rr():
     run_gemm_rr(512, 1024, 768, True, False, "float16", "float16", "float16", 128, 256, 32, 2)
     run_gemm_rr(512, 1024, 768, True, True, "float16", "float16", "float16", 128, 256, 32, 2)
     run_gemm_rr(512, 1024, 768, False, True, "bfloat16", "bfloat16", "float", 128, 256, 32, 2)
+    # n8 tests
+    run_gemm_rr(128, 8, 128, False, True, "float16", "float16", "float16", 128, 8, 32, 2)
+    run_gemm_rr(128, 8, 128, False, True, "int8", "int8", "int32", 128, 8, 32, 2)
+
+    # int8 tests
+    run_gemm_rr(128, 128, 128, False, True, "int8", "int8", "int32", 128, 128, 32, 2)
+    run_gemm_rr(128, 128, 128, False, False, "int8", "int8", "int32", 128, 128, 32, 2)
+    run_gemm_rr(128, 128, 128, True, False, "int8", "int8", "int32", 128, 128, 32, 2)
+    run_gemm_rr(128, 128, 128, True, True, "int8", "int8", "int32", 128, 128, 32, 2)
+
+    # float8 tests
+    run_gemm_rr(128, 128, 128, True, True, "float8_e5m2", "float8_e5m2", "float32", 128, 128, 32, 2)


 if __name__ == "__main__":
     # tilelang.testing.main()
-    tilelang.disable_cache()
-    # test_gemm_ss()
-    # test_gemm_sr()
-    # test_gemm_rs()
-    # test_gemm_rr()
-
-    # run_gemm_sr(128, 128, 128, False, False, "float16", "float16", "float16", 128, 128, 32, 2)
-    # tilelang.testing.set_random_seed(42)
-    run_gemm_ss(128, 128, 128, False, True, "float16", "float16", "float16", 128, 128, 32, 1)
-    # print("gemm fp16 nt ss done")
-    # exit()
-
-    # run_gemm_rs(128, 128, 32, False, True, "float16", "float16", "float16", 128, 128, 32, 0)
-    # print("gemm fp16 nt rs done")
-    # run_gemm_rs(128, 128, 32, False, False, "float16", "float16", "float16", 128, 128, 32, 0)
-    # print("gemm fp16 nn rs done")
-    # run_gemm_rs(128, 128, 32, True, False, "float16", "float16", "float16", 128, 128, 32, 0)
-    # print("gemm fp16 tn rs done")
-    # run_gemm_rs(128, 128, 32, True, True, "float16", "float16", "float16", 128, 128, 32, 0)
-    # print("gemm fp16 tt rs done")
-
-    # run_gemm_rs(16, 16, 16, True, False, "float16", "float16", "float16", 16, 16, 16, 0, 32)
-
-    # run_gemm_rr(128, 128, 32, False, False, "bfloat16", "bfloat16", "float", 128, 128, 32, 0)
-    # print("gemm bf16 nn rr done")
-    # run_gemm_rr(128, 128, 32, False, True, "bfloat16", "bfloat16", "float", 128, 128, 32, 0)
-    # print("gemm bf16 nt rr done")
-    # run_gemm_rr(128, 128, 32, True, False, "bfloat16", "bfloat16", "float", 128, 128, 32, 0)
-    # print("gemm bf16 tn rr done")
-    # run_gemm_rr(128, 128, 32, True, True, "bfloat16", "bfloat16", "float", 128, 128, 32, 0)
-    # print("gemm bf16 tt rr done")
-
-
+    run_gemm_rr(128, 128, 128, True, True, "float8_e5m2", "float8_e5m2", "float32", 128, 128, 32, 2)
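
The new int8 and float8_e5m2 cases accumulate into wider output types ("int32" / "float32"). As an illustration of that convention only, here is a hedged sketch of an int8 reference check in the style these tests use; the ref_program bodies are not shown in this diff, so the transpose handling and casts below are assumptions, and ref_matmul_int8 is a hypothetical name rather than a helper from the test file.

# Hypothetical reference check for the int8 cases, for illustration only.
# Assumes trans_A/trans_B mean the operand is stored transposed, and that the
# kernel accumulates in int32 (PyTorch integer matmul runs on CPU tensors).
import torch

def ref_matmul_int8(A, B, trans_A, trans_B):
    A = A.T if trans_A else A
    B = B.T if trans_B else B
    return torch.matmul(A.to(torch.int32), B.to(torch.int32))

A = torch.randint(-128, 128, (128, 128), dtype=torch.int8)
B = torch.randint(-128, 128, (128, 128), dtype=torch.int8)
C = ref_matmul_int8(A, B, trans_A=False, trans_B=True)  # int32 result, shape (128, 128)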

tilelang/intrinsics/mma_layout.py

Lines changed: 34 additions & 4 deletions

@@ -52,31 +52,49 @@ def shared_16x16_to_mma_a_32x8_layout(i, j):
     thread_id = 4 * (i % 8) + (j % 8) // 2
     return thread_id, 4 * (j // 8) + (i // 8) * 2 + (j % 2)

+
 def shared_16x16_to_mma_a_32x8_layout_trans(i, j):
     return shared_16x16_to_mma_a_32x8_layout(j, i)

+
 # mma.sync matrix B layout, if wanna trans, please apply map_indices
 def shared_16x16_to_mma_b_32x8_layout(i, j):
     thread_id = 4 * (i % 8) + (j % 8) // 2
     return thread_id, 4 * (i // 8) + (j // 8) * 2 + (j % 2)

+
 def shared_16x16_to_mma_b_32x8_layout_trans(i, j):
     return shared_16x16_to_mma_b_32x8_layout(j, i)

+
 shared_16x16_to_mma_32x8_layout_sr_a = shared_16x16_to_mma_a_32x8_layout
 shared_16x16_to_mma_32x8_layout_sr_b = shared_16x16_to_mma_b_32x8_layout
 shared_16x16_to_mma_32x8_layout_rs_a = shared_16x16_to_mma_a_32x8_layout_trans
 shared_16x16_to_mma_32x8_layout_rs_b = shared_16x16_to_mma_b_32x8_layout_trans


-def shared_16x32_to_mma_32x16_layout(i, j):
+def shared_16x32_to_mma_a_32x16_layout(i, j):
     thread_id = 4 * (i % 8) + (j % 16) // 4
     return thread_id, 8 * (j // 16) + (i // 8) * 4 + j % 4


-def shared_32x16_to_mma_32x16_layout(i, j):
-    thread_id = (i % 16) // 4 + 4 * (j % 8)
-    return thread_id, 8 * (j // 8) + (i // 16) * 4 + i % 4
+def shared_32x16_to_mma_a_32x16_layout_trans(i, j):
+    return shared_16x32_to_mma_a_32x16_layout(j, i)
+
+
+def shared_16x32_to_mma_b_32x16_layout(i, j):
+    thread_id = 4 * (i % 8) + (j % 16) // 4
+    return thread_id, 8 * (i // 8) + (j // 16) * 4 + j % 4
+
+
+def shared_32x16_to_mma_b_32x16_layout_trans(i, j):
+    return shared_16x32_to_mma_b_32x16_layout(j, i)
+
+
+shared_16x32_to_mma_32x16_layout_sr_a = shared_16x32_to_mma_a_32x16_layout
+shared_16x32_to_mma_32x16_layout_sr_b = shared_16x32_to_mma_b_32x16_layout
+shared_16x32_to_mma_32x16_layout_rs_a = shared_32x16_to_mma_a_32x16_layout_trans
+shared_16x32_to_mma_32x16_layout_rs_b = shared_32x16_to_mma_b_32x16_layout_trans


 def mma_32x8_to_shared_16x16_layout(thread_id, local_id):

@@ -85,6 +103,18 @@ def mma_32x8_to_shared_16x16_layout(thread_id, local_id):
     return row, col


+def mma_load_a_32x16_to_shared_16x32_layout(thread_id, local_id):
+    row = 8 * (local_id % 8 // 4) + (thread_id // 4)
+    col = 16 * (local_id // 8) + (thread_id % 4) * 4 + (local_id % 4)
+    return row, col
+
+
+def mma_load_b_32x16_to_shared_16x32_layout(thread_id, local_id):
+    row = 8 * (local_id // 8) + (thread_id // 4)
+    col = 16 * (local_id % 8 // 4) + (thread_id % 4) * 4 + (local_id % 4)
+    return row, col
+
+
 def shared_16x16_to_mma_32x8_smoothlayout(i, j):
     return (i * 2 + j // 8, j % 8)
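
The 16x32 shared-to-MMA mappings and the new mma_load_* helpers are pure index arithmetic, so they can be sanity-checked in isolation. The sketch below copies the four functions from this diff verbatim; the checking loop and assertions are an addition, verifying that each forward mapping is a bijection onto 32 threads x 16 registers per fragment and that the corresponding mma_load_* function inverts it.

# Standalone check of the int8/fp8 (16x32) layout helpers added above.
# The four functions are copied from the diff; the assertions are new and
# verify the mappings are mutually inverse bijections.

def shared_16x32_to_mma_a_32x16_layout(i, j):
    thread_id = 4 * (i % 8) + (j % 16) // 4
    return thread_id, 8 * (j // 16) + (i // 8) * 4 + j % 4

def shared_16x32_to_mma_b_32x16_layout(i, j):
    thread_id = 4 * (i % 8) + (j % 16) // 4
    return thread_id, 8 * (i // 8) + (j // 16) * 4 + j % 4

def mma_load_a_32x16_to_shared_16x32_layout(thread_id, local_id):
    row = 8 * (local_id % 8 // 4) + (thread_id // 4)
    col = 16 * (local_id // 8) + (thread_id % 4) * 4 + (local_id % 4)
    return row, col

def mma_load_b_32x16_to_shared_16x32_layout(thread_id, local_id):
    row = 8 * (local_id // 8) + (thread_id // 4)
    col = 16 * (local_id % 8 // 4) + (thread_id % 4) * 4 + (local_id % 4)
    return row, col

for fwd, inv in [(shared_16x32_to_mma_a_32x16_layout, mma_load_a_32x16_to_shared_16x32_layout),
                 (shared_16x32_to_mma_b_32x16_layout, mma_load_b_32x16_to_shared_16x32_layout)]:
    seen = set()
    for i in range(16):
        for j in range(32):
            tid, lid = fwd(i, j)
            assert 0 <= tid < 32 and 0 <= lid < 16
            assert inv(tid, lid) == (i, j)   # load helper round-trips back to (i, j)
            seen.add((tid, lid))
    assert len(seen) == 16 * 32              # every (thread, register) slot is hit exactly once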
