
Commit 1b308ba

[Language] Introduce StridedTensor to support non-contiguous torch inputs (#722)
* Update submodule 'tvm' to commit e11521e6936a827efa334588d29571fbb4620107
* Support strided tensors
* Refactor target attribute helper functions for improved clarity
* No code changes made in proxy.py and setup.py
* lint fix
* lint fix via gemini
* lint fix
* test fix
* test fix
* lint fix
* Update wrapper.py
* test fix
* Enhance test for InjectSoftwarePipeline by adding LowerOpaqueBlock transformation and updating expected function signature to use match_buffer for better clarity.
* lint fix

---------

Co-authored-by: Chenggang Zhao <chenggangz@deepseek.com>
1 parent c369d69 commit 1b308ba
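
For context, a minimal sketch (not part of the commit) of the kind of input this change targets: slicing a torch tensor along its last dimension keeps the original row stride, so the view is non-contiguous and previously would typically need to be materialized with .contiguous() before being handed to a kernel. The shapes below mirror the new copy test added by this commit.

    import torch

    # A (1024, 2048) buffer; taking the first 1024 columns keeps the row stride
    # at 2048 elements, so the view is non-contiguous.
    a = torch.randn(1024, 2048, device="cuda", dtype=torch.float16)
    view = a[:, :1024]
    assert not view.is_contiguous()
    assert view.stride() == (2048, 1)
    # With this commit, such a view can be described on the kernel side as
    #   A: T.StridedTensor((M, N), (NN, 1), dtype)
    # and passed directly, as test_tilelang_language_copy.py now does.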


17 files changed (+430, -158 lines)


examples/fusedmoe/example_fusedmoe_tilelang.py

Lines changed: 0 additions & 2 deletions
@@ -7,8 +7,6 @@
 from tilelang.autotuner import *
 from example_fusedmoe_torch import *
 
-# tilelang.disable_cache()
-
 
 @tilelang.jit(pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True})
 def moe_forward_tilelang_shared(d_hidden,

examples/warp_specialize/example_warp_specialize_flashmla.py

Lines changed: 5 additions & 29 deletions
@@ -145,20 +145,10 @@ def flash_attn(
 clear_accum=True,
 wg_wait=-1)
 T.barrier_wait(kv_shared_0_r_is_ready, k % 2)
-T.gemm(
-Q_shared_r,
-KV_shared_0_r,
-acc_s_0,
-transpose_B=True,
-wg_wait=-1)
+T.gemm(Q_shared_r, KV_shared_0_r, acc_s_0, transpose_B=True, wg_wait=-1)
 
 T.barrier_wait(kv_shared_0_pe_is_ready, k % 2)
-T.gemm(
-Q_pe_local_0,
-K_pe_shared_0,
-acc_s_0,
-transpose_B=True,
-wg_wait=-1)
+T.gemm(Q_pe_local_0, K_pe_shared_0, acc_s_0, transpose_B=True, wg_wait=-1)
 
 T.wait_wgmma(0)
 
@@ -261,20 +251,10 @@ def flash_attn(
 wg_wait=-1)
 
 T.barrier_wait(kv_shared_1_r_is_ready, k % 2)
-T.gemm(
-Q_shared_r,
-KV_shared_1_r,
-acc_s_1,
-transpose_B=True,
-wg_wait=-1)
+T.gemm(Q_shared_r, KV_shared_1_r, acc_s_1, transpose_B=True, wg_wait=-1)
 
 T.barrier_wait(kv_shared_1_pe_is_ready, k % 2)
-T.gemm(
-Q_pe_local_1,
-K_pe_shared_1,
-acc_s_1,
-transpose_B=True,
-wg_wait=-1)
+T.gemm(Q_pe_local_1, K_pe_shared_1, acc_s_1, transpose_B=True, wg_wait=-1)
 
 T.wait_wgmma(0)
 
@@ -308,11 +288,7 @@ def flash_attn(
 
 # Step 10. compute O1 with KV_shared_1_rd
 T.copy(acc_s_1, acc_s_1_cast)
-T.gemm(
-acc_s_1_cast,
-KV_shared_1_r,
-acc_o_r,
-wg_wait=-1)
+T.gemm(acc_s_1_cast, KV_shared_1_r, acc_o_r, wg_wait=-1)
 T.copy(acc_s_1_cast, SP1_shared)
 T.barrier_arrive(s_shared_ready_barrier)

setup.py

Lines changed: 9 additions & 9 deletions
@@ -1,3 +1,6 @@
+import fcntl
+import functools
+import hashlib
 import io
 import subprocess
 import shutil
@@ -12,17 +15,14 @@
 import os
 import sys
 import site
-import hashlib
 import sysconfig
-import functools
 import urllib.request
 from packaging.version import Version
 import platform
 import multiprocessing
 from setuptools.command.build_ext import build_ext
 import importlib
 import logging
-import fcntl
 
 # Configure logging with basic settings
 logging.basicConfig(
@@ -692,15 +692,15 @@ def build_cython(self, ext):
 with open(md5_path, "r") as f:
 cached_hash = f.read().strip()
 if cached_hash == code_hash:
-logger.info("Cython jit adapter is up to date, no need to compile...")
+logger.info("Cython JIT adapter is up to date, no need to compile...")
 need_compile = False
 else:
-logger.info("Cython jit adapter is out of date, need to recompile...")
+logger.info("Cython JIT adapter is out of date, need to recompile...")
 else:
-logger.info("No cached version found for cython jit adapter, need to compile...")
+logger.info("No cached version found for Cython JIT adapter, need to compile...")
 
 if need_compile:
-logger.info("Waiting for lock to compile cython jit adapter...")
+logger.info("Waiting for lock to compile Cython JIT adapter...")
 with open(lock_file, 'w') as lock:
 fcntl.flock(lock.fileno(), fcntl.LOCK_EX)
 try:
@@ -715,7 +715,7 @@ def build_cython(self, ext):
 need_compile = False
 
 if need_compile:
-logger.info("Compiling cython jit adapter...")
+logger.info("Compiling Cython JIT adapter...")
 temp_path = cache_dir / f"temp_{code_hash}.so"
 
 with open(md5_path, "w") as f:
@@ -736,7 +736,7 @@ def build_cython(self, ext):
 except Exception as e:
 if 'temp_path' in locals() and temp_path.exists():
 temp_path.unlink()
-raise Exception(f"Failed to compile cython jit adapter: {e}") from e
+raise Exception(f"Failed to compile Cython JIT adapter: {e}") from e
 finally:
 if lock_file.exists():
 lock_file.unlink()
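
The hunks above hoist fcntl, functools, and hashlib to the top of setup.py and touch the lock-guarded Cython build path. For readers unfamiliar with the pattern, a standalone sketch of advisory file locking with fcntl (the lock path and the guarded step are placeholders, not the project's actual code):

    import fcntl

    # Hypothetical sketch: serialize a build step across processes with an
    # exclusive advisory lock, mirroring the flock usage shown in the diff.
    with open("/tmp/example_build.lock", "w") as lock:
        fcntl.flock(lock.fileno(), fcntl.LOCK_EX)  # blocks until the lock is free
        try:
            pass  # the guarded compile step would run here
        finally:
            fcntl.flock(lock.fileno(), fcntl.LOCK_UN)  # release explicitly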

src/target/codegen_cuda.cc

Lines changed: 70 additions & 0 deletions
@@ -1702,6 +1702,76 @@ void CodeGenTileLangCUDA::VisitExpr_(const RampNode *op, std::ostream &os) {
 os << "))";
 }
 
+void CodeGenTileLangCUDA::VisitExpr_(const BufferLoadNode *op,
+std::ostream &os) { // NOLINT(*)
+ICHECK_EQ(op->indices.size(), 1)
+<< "Load from non-flat memory not supported.";
+ICHECK(!op->predicate.defined())
+<< "Predicated buffer load is not supported.";
+
+DataType value_dtype = op->dtype;
+PrimExpr index = op->indices[0];
+Var buffer_var = op->buffer->data;
+DataType element_dtype = op->buffer->dtype;
+
+int lanes = op->dtype.lanes();
+// delcare type.
+if (value_dtype.lanes() == element_dtype.lanes()) {
+std::string ref = GetBufferRef(op->dtype, op->buffer.get(), index);
+HandleVolatileLoads(ref, op, os);
+} else {
+bool can_vector_load = false;
+arith::PVar<PrimExpr> base;
+if (arith::ramp(base, 1, op->dtype.lanes()).Match(index)) {
+const RampNode *ramp = index.as<RampNode>();
+ICHECK(ramp);
+can_vector_load = true;
+// arith::ModularSet me = arith::Analyzer().modular_set(ramp->base);
+// The condition: {k * coeff + base} divisible by the alignment for any k
+// if (me->coeff % op->dtype.lanes() == 0 && me->base % op->dtype.lanes()
+// == 0) {
+// can_vector_load = true;
+// }
+}
+
+if (value_dtype.is_float4_e2m1fn() && lanes != 1) {
+// A float4_e2m1fn element has 4 bits, which is an incomplete byte.
+// So we cannot vector load it.
+can_vector_load = false;
+}
+if (can_vector_load) {
+std::string ref = GetVecLoad(op->dtype, op->buffer.get(), base.Eval());
+HandleVolatileLoads(ref, op, os);
+} else {
+std::ostringstream svalue_expr;
+std::string sindex = SSAGetID(PrintExpr(index), index.dtype());
+std::string vid = GetVarID(buffer_var.get());
+DataType elem_type = op->dtype.element_of();
+for (int i = 0; i < lanes; ++i) {
+std::ostringstream value_temp;
+if (!HandleTypeMatch(buffer_var.get(), elem_type)) {
+value_temp << "((";
+if (buffer_var.get()->dtype.is_handle()) {
+auto it = alloc_storage_scope_.find(buffer_var.get());
+if (it != alloc_storage_scope_.end()) {
+PrintStorageScope(it->second, value_temp);
+}
+}
+PrintType(elem_type, value_temp);
+value_temp << "*)" << vid << ')';
+} else {
+value_temp << vid;
+}
+value_temp << '[';
+PrintVecElemLoad(sindex, index.dtype(), i, value_temp);
+value_temp << ']';
+PrintVecElemLoadExpr(op->dtype, i, value_temp.str(), svalue_expr);
+}
+os << svalue_expr.str();
+}
+}
+}
+
 void CodeGenTileLangCUDA::VisitExpr_(const BroadcastNode *op,
 std::ostream &os) { // NOLINT(*)
 int lanes = static_cast<int>(Downcast<IntImm>(op->lanes)->value);
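
In rough terms, the new visitor emits a plain buffer reference when the value and element lane counts match, a single vector load when the flattened index is a stride-1 ramp (and the element type is not 4-bit float4_e2m1fn), and otherwise falls back to loading each lane separately. A small Python model of that dispatch (illustrative only, not the codegen itself):

    # Illustrative model of the branch structure in the new BufferLoadNode visitor.
    def lower_buffer_load(value_lanes, element_lanes,
                          index_is_unit_stride_ramp, is_float4_e2m1fn):
        if value_lanes == element_lanes:
            return "plain buffer reference"      # GetBufferRef path
        can_vector_load = index_is_unit_stride_ramp
        if is_float4_e2m1fn and value_lanes != 1:
            # 4-bit elements do not fill a whole byte, so no vector load.
            can_vector_load = False
        if can_vector_load:
            return "single vectorized load"      # GetVecLoad path
        return "per-lane element loads"          # PrintVecElemLoad path

    print(lower_buffer_load(4, 1, True, False))   # single vectorized load
    print(lower_buffer_load(4, 1, False, False))  # per-lane element loads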

src/target/codegen_cuda.h

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@ class CodeGenTileLangCUDA final : public CodeGenC {
 void VisitStmt_(const EvaluateNode *op) final;
 void VisitStmt_(const AllocateNode *op) final;
 void VisitStmt_(const AttrStmtNode *op) final;
+void VisitExpr_(const BufferLoadNode *op, std::ostream &os) final;
 
 // Override this as a work around for __grid_constant__ parameter
 void AddFunction(const GlobalVar &gvar, const PrimFunc &f);

src/tl_templates/hip/reduce.h

Lines changed: 2 additions & 1 deletion
@@ -22,7 +22,8 @@ struct MinOp {
 }
 };
 
-template <class Reducer, int threads, int scale, int thread_offset = 0> struct AllReduce {
+template <class Reducer, int threads, int scale, int thread_offset = 0>
+struct AllReduce {
 static_assert(threads == 1024 || threads == 512 || threads == 256 ||
 threads == 128 || threads == 64 || threads == 32 ||
 threads == 16 || threads == 8 || threads == 4 || threads == 2);

src/transform/loop_vectorize.cc

Lines changed: 28 additions & 6 deletions
@@ -136,11 +136,23 @@ class VectorizePlanner : public arith::IRVisitorWithAnalyzer {
 max_vector_size = gcd_base;
 }
 vector_size_ = arith::ZeroAwareGCD(max_vector_size, vector_size_);
+
+// Generate strides if not existed
+auto strides = buffer->strides;
+if (buffer->strides.size() == 0) {
+PrimExpr stride = 1;
+for (int i = indices.size() - 1; i >= 0; --i) {
+strides.push_back(stride);
+stride = stride * buffer->shape[i];
+}
+strides = Array<PrimExpr>{strides.rbegin(), strides.rend()};
+}
+
+// Generate and check element offset expression
+ICHECK(indices.size() == strides.size()) << "Invalid indices and strides";
 PrimExpr elem_offset = 0;
-PrimExpr stride = 1;
-for (int i = indices.size() - 1; i >= 0; --i) {
-elem_offset = elem_offset + indices[i] * stride;
-stride = stride * buffer->shape[i];
+for (int i = 0; i < indices.size(); ++i) {
+elem_offset += indices[i] * strides[i];
 }
 while (!IndiceCanVectorize(elem_offset, inner_for_->loop_var,
 inner_for_->extent, vector_size_,
@@ -229,10 +241,19 @@ bool IndiceCanVectorize(PrimExpr expr, Var var, PrimExpr iter_var_size,
 ICHECK(target_vectorized_size >= 1);
 if (target_vectorized_size == 1)
 return true;
-// bind thread range
+
+// Extent must be divisible
 if (!analyzer->CanProveEqual(FloorMod(iter_var_size, target_vectorized_size),
 0))
 return false;
+
+// The base offset must be divisible
+if (!analyzer->CanProveEqual(
+FloorMod(Substitute(expr, {{var, 0}}), target_vectorized_size), 0)) {
+return false;
+}
+
+// Bind thread range
 Var v0("v0"), v1("v1");
 analyzer->Bind(v0, Range(0, target_vectorized_size));
 analyzer->Bind(v1, Range(0, analyzer->Simplify(FloorDiv(
@@ -241,7 +262,8 @@ bool IndiceCanVectorize(PrimExpr expr, Var var, PrimExpr iter_var_size,
 Substitute(expr, {{var, v0 + v1 * target_vectorized_size}}));
 Vectorizer vectorizer(v0, IntImm(v0->dtype, target_vectorized_size));
 PrimExpr expr_vectorized = vectorizer.VisitExpr(expr_transformed);
-// This simplify is necessary for thread region specifiled
+
+// This simplify is necessary for thread region specified
 // optimizations.
 expr_vectorized = analyzer->Simplify(expr_vectorized);
 auto ramp_node = expr_vectorized.as<RampNode>();
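
Two changes are visible here: the planner now derives row-major strides when a buffer declares none and builds the element offset from indices and strides (so non-contiguous buffers get correct offsets), and IndiceCanVectorize additionally requires the base offset, not just the loop extent, to be divisible by the vector width. A small Python illustration of the offset computation (not part of the commit):

    # Row-major strides are generated only when the buffer declares none;
    # the element offset is then a dot product of indices and strides.
    def elem_offset(indices, shape, strides=None):
        if strides is None:
            strides, s = [], 1
            for extent in reversed(shape):
                strides.append(s)
                s *= extent
            strides.reverse()
        assert len(indices) == len(strides), "Invalid indices and strides"
        return sum(i * st for i, st in zip(indices, strides))

    # Contiguous (4, 8) buffer vs. a strided view with row stride 16:
    print(elem_offset([2, 3], shape=[4, 8]))                   # 2 * 8 + 3 = 19
    print(elem_offset([2, 3], shape=[4, 8], strides=[16, 1]))  # 2 * 16 + 3 = 35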

testing/python/language/test_tilelang_language_copy.py

Lines changed: 46 additions & 2 deletions
@@ -28,8 +28,8 @@ def run_tilelang_copy(M=1024, N=1024, block_M=128, block_N=128, dtype="float16")
 out_idx=[1],
 target="cuda",
 pass_configs={
-"tl.disable_warp_specialized": True,
-"tl.disable_tma_lower": True
+tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True
 })
 a = torch.randn(M, N, device="cuda", dtype=getattr(torch, dtype))
 b = kernel(a)
@@ -42,5 +42,49 @@ def test_tilelang_copy():
 run_tilelang_copy(M=1024, N=576, block_M=32, block_N=576, dtype="float")
 
 
+def tilelang_copy_with_stride(M, N, NN, block_M, block_N, dtype="float16"):
+
+@T.prim_func
+def main(
+A: T.StridedTensor((M, N), (NN, 1), dtype),
+B: T.Tensor((M, N), dtype),
+):
+# Initialize Kernel Context
+with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
+for i, j in T.Parallel(block_M, block_N):
+B[by * block_M + i, bx * block_N + j] = A[by * block_M + i, bx * block_N + j]
+
+return main
+
+
+def run_tilelang_copy_with_stride(M=1024,
+N=1024,
+NN=2048,
+block_M=128,
+block_N=128,
+dtype="float16"):
+if isinstance(NN, int):
+assert NN > N, "NN must be greater than N"
+program = tilelang_copy_with_stride(M, N, NN, block_M, block_N, dtype)
+kernel = tilelang.compile(
+program,
+out_idx=[1],
+target="cuda",
+pass_configs={
+tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
+})
+if isinstance(NN, T.Var):
+NN = N * 2
+a = torch.randn(M, NN, device="cuda", dtype=getattr(torch, dtype))
+b = kernel(a[:, :N])
+torch.testing.assert_close(b, a[:, :N], rtol=1e-2, atol=1e-2)
+
+
+def test_tilelang_copy_with_stride():
+run_tilelang_copy_with_stride(M=1024, N=1024, NN=2048, block_M=128, block_N=128)
+run_tilelang_copy_with_stride(M=1024, N=1024, NN=T.symbolic("NN"), block_M=128, block_N=128)
+
+
 if __name__ == "__main__":
 tilelang.testing.main()

testing/python/transform/test_tilelang_transform_Inject_software_pipeline.py

Lines changed: 11 additions & 26 deletions
@@ -9,6 +9,7 @@ def _check(original, transformed):
 mod = tvm.IRModule.from_expr(func.with_attr("global_symbol", "main"))
 mod = tl.transform.InjectSoftwarePipeline()(mod)
 mod = tl.transform.Simplify()(mod)
+mod = tl.transform.LowerOpaqueBlock()(mod)
 tvm.ir.assert_structural_equal(mod["main"], transformed.with_attr("global_symbol", "main"),
 True)
 
@@ -39,32 +40,16 @@ def before(A: T.Tensor((16, 1), "float32"), C: T.Tensor((16, 1), "float32")):
 C[tx, i] = B[tx, 0] + T.float32(1)
 
 @T.prim_func
-def expected(A: T.Buffer((16, 1), "float32"), C: T.Buffer((16, 1), "float32")):
-for tx in T.thread_binding(16, thread="threadIdx.x"):
-with T.block():
-T.reads(A[tx, 0])
-T.writes(C[tx, 0])
-B = T.alloc_buffer((2, 16, 1), scope="shared")
-with T.block():
-T.reads(A[tx, 0])
-T.writes(B[0, tx, 0])
-B[0, tx, 0] = A[tx, 0] * T.float32(2.0)
-with T.block():
-T.reads(A[tx, 1:1], B[0:2, tx, 0])
-T.writes(B[1:1, tx, 0], C[tx, 0:0])
-for i in range(0):
-with T.block():
-T.reads(A[tx, i + 1])
-T.writes(B[i + 1, tx, 0])
-B[i + 1, tx, 0] = A[tx, i + 1] * T.float32(2.0)
-with T.block():
-T.reads(B[i, tx, 0])
-T.writes(C[tx, i])
-C[tx, i] = B[i, tx, 0] + T.float32(1.0)
-with T.block():
-T.reads(B[0, tx, 0])
-T.writes(C[tx, 0])
-C[tx, 0] = B[0, tx, 0] + T.float32(1.0)
+def expected(A_handle: T.handle, C_handle: T.handle):
+A = T.match_buffer(A_handle, (16, 1), strides=(1, 1))
+C = T.match_buffer(C_handle, (16, 1), strides=(1, 1))
+tx = T.launch_thread("threadIdx.x", 16)
+B = T.decl_buffer((2, 16, 1), scope="shared")
+B[0, tx, 0] = A[tx, 0] * T.float32(2.0)
+for i in range(0):
+B[i + 1, tx, 0] = A[tx, i + 1] * T.float32(2.0)
+C[tx, i] = B[i, tx, 0] + T.float32(1.0)
+C[tx, 0] = B[0, tx, 0] + T.float32(1.0)
 
 _check(before, expected)
