Skip to content

Commit 7880876

Browse files
committed
Merge branch 'main' of https://github.com/tile-ai/tilelang into kurisu-patch-1
2 parents 61318ce + 6b12502 commit 7880876

22 files changed

+370
-953
lines changed

.github/workflows/ci.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -104,18 +104,18 @@ jobs:
104104
- name: Install project (wheel form)
105105
run: |
106106
source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate"
107-
pip install . --no-user
107+
pip install . --no-user -v
108108
109109
- name: Run examples
110110
run: |
111111
source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate"
112112
cd examples
113113
unset PYTHONPATH
114-
python -m pytest -n 4 **/test*.py
114+
python -m pytest -n 4 **/test*.py -v -r fE
115115
116116
- name: Run tests
117117
run: |
118118
source "${{ runner.tool_cache }}/${{ env.VENV_DIR }}/bin/activate"
119119
cd testing/python
120120
unset PYTHONPATH
121-
python -m pytest -n 4
121+
python -m pytest -n 4 -v -r fE

pyproject.toml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,16 @@
11
[build-system]
22
requires = [
3+
"build",
34
"cmake>=3.26",
4-
"cython",
55
"packaging",
66
"setuptools>=61",
7+
"torch",
78
"wheel",
9+
"tox",
10+
"auditwheel",
11+
"patchelf",
12+
"ninja",
13+
"Cython",
814
]
915
build-backend = "setuptools.build_meta"
1016

setup.py

Lines changed: 38 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,8 @@ def get_nvcc_cuda_version():
112112
113113
Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
114114
"""
115-
nvcc_output = subprocess.check_output(["nvcc", "-V"], universal_newlines=True)
115+
nvcc_path = os.path.join(CUDA_HOME, "bin", "nvcc")
116+
nvcc_output = subprocess.check_output([nvcc_path, "-V"], universal_newlines=True)
116117
output = nvcc_output.split()
117118
release_idx = output.index("release") + 1
118119
nvcc_cuda_version = Version(output[release_idx].split(",")[0])
@@ -788,26 +789,46 @@ def build_cmake(self, ext):
788789
build_temp = os.path.abspath(self.build_temp)
789790
os.makedirs(build_temp, exist_ok=True)
790791

791-
# Copy the default 'config.cmake' from the source tree into our build directory.
792-
src_config_cmake = os.path.join(ext.sourcedir, "3rdparty", "tvm", "cmake", "config.cmake")
793-
dst_config_cmake = os.path.join(build_temp, "config.cmake")
794-
shutil.copy(src_config_cmake, dst_config_cmake)
795-
796-
# Append some configuration variables to 'config.cmake'
797-
with open(dst_config_cmake, "a") as config_file:
798-
config_file.write(f"set(USE_LLVM {llvm_config_path})\n")
799-
if USE_ROCM:
800-
config_file.write(f"set(USE_ROCM {ROCM_HOME})\n")
801-
config_file.write("set(USE_CUDA OFF)\n")
802-
else:
803-
config_file.write(f"set(USE_CUDA {CUDA_HOME})\n")
804-
config_file.write("set(USE_ROCM OFF)\n")
792+
# Paths to the source and destination config.cmake files
793+
src_config = Path(ext.sourcedir) / "3rdparty" / "tvm" / "cmake" / "config.cmake"
794+
dst_config = Path(build_temp) / "config.cmake"
795+
796+
# Read the default config template
797+
content_lines = src_config.read_text().splitlines()
798+
799+
# Add common LLVM configuration
800+
content_lines.append(f"set(USE_LLVM {llvm_config_path})")
801+
802+
# Append GPU backend configuration based on environment
803+
if USE_ROCM:
804+
content_lines += [
805+
f"set(USE_ROCM {ROCM_HOME})",
806+
"set(USE_CUDA OFF)",
807+
]
808+
else:
809+
content_lines += [
810+
f"set(USE_CUDA {CUDA_HOME})",
811+
"set(USE_ROCM OFF)",
812+
]
813+
814+
# Create the final file content
815+
new_content = "\n".join(content_lines) + "\n"
816+
817+
# Write the file only if it does not exist or has changed
818+
if not dst_config.exists() or dst_config.read_text() != new_content:
819+
dst_config.write_text(new_content)
820+
print(f"[Config] Updated: {dst_config}")
821+
else:
822+
print(f"[Config] No changes: {dst_config}")
805823

806824
# Run CMake to configure the project with the given arguments.
807-
subprocess.check_call(["cmake", ext.sourcedir] + cmake_args, cwd=build_temp)
825+
if not os.path.exists(build_temp + "/build.ninja"):
826+
subprocess.check_call(["cmake", ext.sourcedir] + cmake_args, cwd=build_temp)
808827

809828
# Build the project in "Release" mode with all available CPU cores ("-j").
810-
subprocess.check_call(["cmake", "--build", ".", "--config", "Release", "-j"],
829+
num_jobs = max(1, int(multiprocessing.cpu_count() * 0.75))
830+
subprocess.check_call(["cmake", "--build", ".", "--config", "Release", "-j",
831+
str(num_jobs)],
811832
cwd=build_temp)
812833

813834

src/op/builtin.cc

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -90,11 +90,6 @@ TIR_DEFINE_TL_BUILTIN(ptx_stmatrix)
9090
.set_attr<TCallEffectKind>("TCallEffectKind",
9191
Integer(CallEffectKind::kOpaque));
9292

93-
TIR_DEFINE_TL_BUILTIN(sync_thread_partial)
94-
.set_num_inputs(2)
95-
.set_attr<TCallEffectKind>("TCallEffectKind",
96-
Integer(CallEffectKind::kOpaque));
97-
9893
TIR_DEFINE_TL_BUILTIN(fence_proxy_async)
9994
.set_num_inputs(0)
10095
.set_attr<TCallEffectKind>("TCallEffectKind",

src/op/builtin.h

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -169,14 +169,6 @@ TVM_DLL const Op &ptx_stmatrix();
169169
*/
170170
TVM_DLL const Op &pack_b16();
171171

172-
/*!
173-
* \brief Similar to __syncthreads(), but can be used to sync partial threads
174-
*
175-
* sync_thread_partial(num_partial_threads or mbarrier)
176-
*
177-
*/
178-
TVM_DLL const Op &sync_thread_partial();
179-
180172
/*!
181173
* \brief Issue a shared memory fence for async operations
182174
*

src/target/codegen_cuda.cc

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1050,8 +1050,6 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) {
10501050
auto mbarrier_obj = print_mbarrier_obj(op->args[0]);
10511051
auto phase = this->PrintExpr(op->args[1]);
10521052
this->stream << mbarrier_obj << ".wait(" << phase << ");\n";
1053-
} else if (op->op.same_as(tl::sync_thread_partial())) {
1054-
print_extern_call_stmt("cutlass::arch::NamedBarrier::sync");
10551053
} else if (op->op.same_as(tl::no_set_max_nreg())) {
10561054
return;
10571055
} else if (op->op.same_as(tl::tma_load())) {

src/target/codegen_hip.cc

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -784,8 +784,28 @@ void CodeGenTileLangHIP::VisitExpr_(const CallNode *op, std::ostream &os) {
784784
int n = Downcast<IntImm>(op->args[0])->value;
785785
std::string func_name = "tl::cp_async_wait<" + std::to_string(n) + ">";
786786
print_extern_call_stmt(func_name, 1);
787-
} else if (op->op.same_as(tl::sync_thread_partial())) {
788-
print_extern_call_stmt("tl::syncthreads_partial");
787+
} else if (op->op.same_as(builtin::create_barriers())) {
788+
this->PrintIndent();
789+
int barrier_count = Downcast<IntImm>(op->args[0])->value;
790+
std::string barrier_name = "_mbarrier";
791+
this->stream << "__shared__ uint64_t " << barrier_name << "["
792+
<< barrier_count << "];\n";
793+
} else if (op->op.same_as(tl::get_mbarrier())) {
794+
std::string barrier_name = "_mbarrier";
795+
std::string barrier_id = this->PrintExpr(op->args[0]);
796+
os << barrier_name + "[" + barrier_id + "]";
797+
} else if (op->op.same_as(builtin::ptx_arrive_barrier())) {
798+
print_extern_call_stmt("tl::mbarrier_arrive");
799+
} else if (op->op.same_as(builtin::ptx_init_barrier_thread_count())) {
800+
print_extern_call_stmt("tl::mbarrier_init");
801+
} else if (op->op.same_as(builtin::ptx_arrive_barrier_expect_tx())) {
802+
print_extern_call_stmt("tl::mbarrier_arrive_expect_tx");
803+
} else if (op->op.same_as(builtin::ptx_cp_async_barrier())) {
804+
print_extern_call_stmt("tl::mbarrier_cp_async_arrive");
805+
} else if (op->op.same_as(tl::mbarrier_expect_tx())) {
806+
print_extern_call_stmt("tl::mbarrier_expect_tx");
807+
} else if (op->op.same_as(tl::mbarrier_wait_parity())) {
808+
print_extern_call_stmt("tl::mbarrier_wait");
789809
} else if (op->op.same_as(tl::ptx_stmatrix())) {
790810
int trans = Downcast<IntImm>(op->args[0])->value;
791811
int num = Downcast<IntImm>(op->args[1])->value;

src/tl_templates/cuda/common.h

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -241,12 +241,43 @@ TL_DEVICE void __sync_thread_partial() {
241241
asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(thread_count));
242242
}
243243

244+
// Template parameter:
245+
// thread_extent: the logical size (in number of threads) of each "group"
246+
// within which we want to elect exactly ONE representative
247+
// thread.
244248
template <int thread_extent> TL_DEVICE bool tl_shuffle_elect() {
249+
250+
// Special case: thread_extent == 0 means "elect exactly one thread
251+
// in the entire thread block", i.e., the leader of the first warp of the
252+
// block.
245253
if constexpr (thread_extent == 0) {
254+
// cutlass::canonical_warp_idx_sync():
255+
// Returns the warp ID within the thread block in a "canonical" way
256+
// (0 for the first warp, 1 for the second, ...).
257+
// cute::elect_one_sync():
258+
// Elect exactly one lane in the warp to return true (typically lane 0),
259+
// other lanes return false.
260+
// The condition ensures that:
261+
// (1) We are in warp 0 of the block.
262+
// (2) We are the elected lane in this warp.
246263
return cutlass::canonical_warp_idx_sync() == 0 && cute::elect_one_sync();
247264
}
248-
return __shfl_sync(0xffffffff, (threadIdx.x / 32) % (thread_extent / 32),
249-
0) == 0 &&
265+
266+
// General case: thread_extent != 0
267+
// (threadIdx.x / 32) is the warp index in the block.
268+
// (thread_extent / 32) is the number of warps in one group of size
269+
// thread_extent. We take warp_id % num_warps_in_group to get the warp's index
270+
// within the group.
271+
// __shfl_sync(mask, value, srcLane): broadcast 'value' from srcLane to all
272+
// lanes in the warp. Here it broadcasts the group-local warp index from lane
273+
// 0. Comparing to 0 selects only the group's warp 0.
274+
return __shfl_sync(0xffffffff, // full warp mask
275+
(threadIdx.x / 32) %
276+
(thread_extent / 32), // warp index within group
277+
0 // take the value from lane 0
278+
) == 0 &&
279+
// Within that group leader warp, elect exactly one lane (typically
280+
// lane 0) to be the single representative for the group.
250281
cute::elect_one_sync();
251282
}
252283

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
/*!
 * \file thread_sync_types.h
 * \brief Key type and reserved hardware-barrier IDs used by thread-sync
 *        lowering.
 */
#ifndef TVM_TL_THREAD_BOUND_KEY_H_
#define TVM_TL_THREAD_BOUND_KEY_H_

#include <cstdint>
#include <functional>

namespace tvm {
namespace tl {

/*!
 * \brief Identifies an inclusive rectangular thread range
 *        [tx_min, tx_max] x [ty_min, ty_max] x [tz_min, tz_max].
 *
 * Intended for use as an unordered-container key so that synchronization
 * covering the same set of threads can be deduplicated.
 */
struct ThreadBoundKey {
  int64_t tx_min, tx_max, ty_min, ty_max, tz_min, tz_max;

  /*! \brief Field-wise equality over all six bounds. */
  bool operator==(const ThreadBoundKey &other) const noexcept {
    return tx_min == other.tx_min && tx_max == other.tx_max &&
           ty_min == other.ty_min && ty_max == other.ty_max &&
           tz_min == other.tz_min && tz_max == other.tz_max;
  }
};

// There are 16 named barriers provided by hardware starting in Hopper.
// Their IDs are in the range 0-15.
// The number of threads syncing on a barrier must be a multiple of the
// warp size.
// NOTE(review): ID 0 may be used by other driver APIs (e.g. __syncthreads)
// and conflict with other uses; it is reserved here as kSyncThreads, and
// freely assignable barriers start at kFirstUsedBarrier — confirm this
// aliasing of ID 0 is intentional.
enum class ReservedNamedBarriers {
  kSyncThreads = 0,
  kReduce_0 = 1,
  kReduce_1 = 2,
  kFirstUsedBarrier = kReduce_1 + 1
};

} // namespace tl
} // namespace tvm

namespace std {
/*!
 * \brief std::hash specialization so ThreadBoundKey works in
 *        unordered_map/unordered_set.
 */
template <> struct hash<tvm::tl::ThreadBoundKey> {
  size_t operator()(const tvm::tl::ThreadBoundKey &k) const noexcept {
    // 31-based polynomial combine over the six bounds, in the same field
    // order as operator== so equal keys always hash equally.
    size_t h = std::hash<int64_t>()(k.tx_min);
    h = h * 31 + std::hash<int64_t>()(k.tx_max);
    h = h * 31 + std::hash<int64_t>()(k.ty_min);
    h = h * 31 + std::hash<int64_t>()(k.ty_max);
    h = h * 31 + std::hash<int64_t>()(k.tz_min);
    h = h * 31 + std::hash<int64_t>()(k.tz_max);
    return h;
  }
};
} // namespace std

#endif // TVM_TL_THREAD_BOUND_KEY_H_

src/transform/storage_access.cc

Lines changed: 60 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ using namespace tir;
3838

3939
void TileLangStorageAccessVisitor::VisitExpr_(const BufferLoadNode *op) {
4040
Var buf = op->buffer->data;
41+
buffer_data_to_buffer_.Set(GetRef<Var>(buf.get()), op->buffer);
4142
StorageScope scope = GetScope(buf);
4243
if (Enabled(buf.get(), scope)) {
4344
ICHECK(allow_append_) << GetRef<BufferLoad>(op) << " " << scope.to_string();
@@ -64,6 +65,7 @@ void TileLangStorageAccessVisitor::VisitStmt_(const BufferStoreNode *op) {
6465
curr_stmt_.stmt = op;
6566

6667
Var buf = op->buffer->data;
68+
buffer_data_to_buffer_.Set(GetRef<Var>(buf.get()), op->buffer);
6769
StorageScope scope = GetScope(buf);
6870
if (Enabled(buf.get(), scope)) {
6971
AccessEntry e;
@@ -115,6 +117,15 @@ void TileLangStorageAccessVisitor::VisitStmt_(const LetStmtNode *op) {
115117
this->VisitStmt(op->body);
116118
}
117119

120+
void TileLangStorageAccessVisitor::VisitStmt_(const BlockNode *op) {
121+
auto block = Downcast<Block>(op);
122+
for (const auto &buffer : block->alloc_buffers) {
123+
ICHECK(buffer->IsInstance<BufferNode>());
124+
buffer_data_to_buffer_.Set(buffer->data, buffer);
125+
}
126+
IRVisitorWithAnalyzer::VisitStmt_(op);
127+
}
128+
118129
void TileLangStorageAccessVisitor::VisitStmt_(const AttrStmtNode *op) {
119130
if (op->attr_key == tvm::tir::attr::double_buffer_write) {
120131
ICHECK(double_buffer_write_ == nullptr);
@@ -271,18 +282,27 @@ void TileLangStorageAccessVisitor::VisitExpr_(const CallNode *op) {
271282
Buffer buffer = load->buffer;
272283
DataType dtype = buffer->dtype;
273284
const VarNode *buffer_var = buffer->data.as<VarNode>();
285+
buffer_data_to_buffer_.Set(GetRef<Var>(buffer_var), buffer);
274286
StorageScope scope = GetScope(GetRef<Var>(buffer_var));
287+
Array<Range> buffer_ranges;
288+
// from indices to buffer indices
289+
ICHECK(buffer->shape.size() == load->indices.size());
290+
for (size_t i = 0; i < buffer->shape.size(); ++i) {
291+
buffer_ranges.push_back(
292+
Range::FromMinExtent(load->indices[i], buffer->shape[i]));
293+
}
275294
if (Enabled(buffer_var, scope)) {
276295
ICHECK(allow_append_);
277296
AccessEntry e;
278297
e.threads = env_threads();
279298
e.thread_range = this->ComputeThreadRange(e.threads);
280299
e.dtype = dtype;
281300
e.buffer = Downcast<Var>(buffer->data);
282-
e.buffer_indices = load->indices;
301+
e.buffer_ranges = buffer_ranges;
283302
for (const auto &index : load->indices) {
284303
e.touched.push_back(arith::IntSet::Vector(index));
285304
}
305+
e.is_pointer_access = true;
286306
e.type = kRead;
287307
e.scope = scope;
288308
curr_stmt_.access.emplace_back(e);
@@ -294,20 +314,54 @@ void TileLangStorageAccessVisitor::VisitExpr_(const CallNode *op) {
294314
} else if (op->op.same_as(builtin::tvm_access_ptr())) {
295315
ICHECK_EQ(op->args.size(), 5U);
296316
DataType dtype = op->args[0].dtype();
297-
const VarNode *buffer = op->args[1].as<VarNode>();
317+
const VarNode *buffer_var = op->args[1].as<VarNode>();
298318
PrimExpr offset = op->args[2];
299319
PrimExpr extent = op->args[3];
300320
const IntImmNode *flag = op->args[4].as<IntImmNode>();
301-
StorageScope scope = GetScope(GetRef<Var>(buffer));
321+
StorageScope scope = GetScope(GetRef<Var>(buffer_var));
302322
// The buffer scope.
303-
if (Enabled(buffer, scope)) {
323+
if (Enabled(buffer_var, scope)) {
304324
ICHECK(allow_append_);
325+
Array<Range> buffer_ranges;
326+
if (buffer_data_to_buffer_.find(GetRef<Var>(buffer_var)) ==
327+
buffer_data_to_buffer_.end()) {
328+
// cannot find buffer map, use the default buffer
329+
buffer_ranges = {Range::FromMinExtent(offset, extent)};
330+
} else {
331+
Buffer buffer = buffer_data_to_buffer_.at(GetRef<Var>(buffer_var));
332+
auto buffer_shape = buffer->shape;
333+
// convert 1d offset to multi-dimensional index
334+
auto linear_to_indices = [this](PrimExpr offset,
335+
const Array<PrimExpr> &shape) {
336+
Array<PrimExpr> indices;
337+
PrimExpr remaining = offset;
338+
for (size_t i = 0; i < shape.size(); ++i) {
339+
PrimExpr stride = make_const(DataType::Int(32), 1);
340+
for (size_t j = i + 1; j < shape.size(); ++j) {
341+
stride = stride * shape[j];
342+
}
343+
PrimExpr idx = FloorDiv(remaining, stride);
344+
remaining = FloorMod(remaining, stride);
345+
indices.push_back(analyzer_.Simplify(idx));
346+
}
347+
return indices;
348+
};
349+
Array<PrimExpr> start_indices = linear_to_indices(offset, buffer_shape);
350+
Array<PrimExpr> end_indices =
351+
linear_to_indices(offset + extent, buffer_shape);
352+
for (size_t i = 0; i < buffer_shape.size(); ++i) {
353+
buffer_ranges.push_back(Range::FromMinExtent(
354+
start_indices[i],
355+
analyzer_.Simplify(end_indices[i] - start_indices[i])));
356+
}
357+
}
305358
AccessEntry e;
306359
e.threads = env_threads();
307360
e.thread_range = this->ComputeThreadRange(e.threads);
308361
e.dtype = dtype;
309-
e.buffer = Downcast<Var>(op->args[1]);
310-
e.buffer_indices = {offset, extent};
362+
e.buffer = GetRef<Var>(buffer_var);
363+
e.buffer_ranges = buffer_ranges;
364+
e.is_pointer_access = true;
311365
e.touched = {
312366
arith::IntSet::FromRange(Range::FromMinExtent(offset, extent))};
313367
e.scope = scope;

0 commit comments

Comments
 (0)