Merged

Commits (20)
03bc5f6
[Enhancement] Introduce finalize_reducer operator and layout reducer …
Aug 25, 2025
e3c4619
Refactor code formatting and improve readability in multiple files
LeiWang1999 Aug 25, 2025
fc44d44
Remove debug print statements from `copy.cc` and `inject_tma_barrier.…
LeiWang1999 Aug 25, 2025
4713bd8
[Enhancement] Disable reuse of small arrays in shared memory allocation
LeiWang1999 Aug 25, 2025
deacc33
Merge branch 'main' of https://github.com/tile-ai/tilelang into reduc…
LeiWang1999 Aug 25, 2025
c30423c
Refactor `setup.py` to remove duplicate logging statements and enhanc…
LeiWang1999 Aug 25, 2025
4c0f978
Refactor `finalize_reducer` and `reduce` functions to remove redundan…
LeiWang1999 Aug 25, 2025
6e40711
bug fix
LeiWang1999 Aug 25, 2025
38bdc75
Add thread checks workaround for replicated cases
kurisu6912 Aug 27, 2025
a23a370
Merge pull request #1 from kurisu6912/kurisu-fix-reducer-0825
LeiWang1999 Aug 27, 2025
3c8e3bc
Remove the is_one check
kurisu6912 Aug 27, 2025
19c0fc8
Merge pull request #2 from kurisu6912/kurisu-fix-reducer-0825
LeiWang1999 Aug 27, 2025
5401c94
fix lint error
kurisu6912 Aug 27, 2025
fc2f55f
Merge branch 'main' of https://github.com/tile-ai/tilelang into reduc…
LeiWang1999 Aug 29, 2025
d9443a1
lint fix
LeiWang1999 Aug 29, 2025
4c1aa26
Update autotune tests to use smaller matrix sizes for improved perfor…
LeiWang1999 Aug 29, 2025
d664a15
Merge branch 'main' of https://github.com/tile-ai/tilelang into reduc…
LeiWang1999 Aug 29, 2025
c1b68b9
[Refactor] Update FinalizeReducer to FinalizeReducerOp and adjust rel…
LeiWang1999 Aug 29, 2025
67c1263
[Refactor] Update atomic operations in common.h and modify test_examp…
LeiWang1999 Aug 29, 2025
2431568
[Refactor] Simplify CopyNode::LowerBulkCopy logic and update test exe…
LeiWang1999 Aug 30, 2025
2 changes: 0 additions & 2 deletions setup.py
@@ -767,8 +767,6 @@ def build_cmake(self, ext):
if self.inplace:
extdir = os.path.abspath('./tilelang/lib/')

-logger.info(f"{extdir=}")

# Prepare arguments for the CMake configuration step.
# -DCMAKE_LIBRARY_OUTPUT_DIRECTORY sets where built libraries go
# -DPYTHON_EXECUTABLE ensures that the correct Python is used
8 changes: 8 additions & 0 deletions src/op/builtin.h
@@ -129,6 +129,14 @@ TVM_DLL const Op &tma_load_im2col();
*/
TVM_DLL const Op &tma_store();

+/*!
+ * \brief tvm intrinsics for barrier initialization fence
+ *
+ * ptx_fence_barrier_init()
+ *
+ */
+const Op &ptx_fence_barrier_init();

Comment on lines +132 to +139

💡 Verification agent

🧩 Analysis chain

Export the new intrinsic with TVM_DLL for consistency and visibility

All neighboring intrinsics are declared with TVM_DLL. Omitting it for ptx_fence_barrier_init may prevent the symbol from being exported/visible when linking dynamically. Please align with the rest.

 /*!
  * \brief tvm intrinsics for barrier initialization fence
  *
  * ptx_fence_barrier_init()
  *
  */
-const Op &ptx_fence_barrier_init();
+TVM_DLL const Op &ptx_fence_barrier_init();

Optionally, consider grouping this declaration next to other barrier-related intrinsics (e.g., tma_store_arrive, tma_store_wait) for discoverability.

Run this quick check to ensure the intrinsic is implemented/registered and has a Python wrapper if needed:


🏁 Script executed:

#!/bin/bash
# Where is the intrinsic referenced/implemented?
rg -nP 'ptx_fence_barrier_init\s*\(' -S

# Is the op name registered/exposed anywhere (C++ or Python)?
rg -nP '"tl\.ptx_fence_barrier_init"' -S

Length of output: 190


Add TVM_DLL and register ptx_fence_barrier_init for end-to-end visibility

I don’t see any registration for "tl.ptx_fence_barrier_init" or a Python binding anywhere in the repo—only the bare declaration in src/op/builtin.h. To make this new intrinsic fully functional and exportable, please:

  • Export the symbol
    In src/op/builtin.h (lines 135–138), update the declaration:

    /*!
     * \brief tvm intrinsics for barrier initialization fence
     *
     * ptx_fence_barrier_init()
     *
     */
    -const Op &ptx_fence_barrier_init();
    +TVM_DLL const Op &ptx_fence_barrier_init();
  • Register the op in C++
    Add a TVM_REGISTER_GLOBAL("tl.ptx_fence_barrier_init") (e.g. in src/op/builtin.cc) with the appropriate set_body_typed<…>(…) implementation so that the intrinsic can be looked up at runtime.

  • Expose a Python binding
    In the Python frontend (under python/tvm/… or TOPI), add a wrapper for ptx_fence_barrier_init so that it’s available to users scripting in Python.

  • Optional: group with related intrinsics
    Consider moving this declaration next to other barrier- or TMA-related intrinsics (e.g. tma_store_arrive, tma_store_wait) for better discoverability.

Without these steps, the symbol won’t be exported from the shared library nor available in the Python API, leading to link-time or runtime failures.


Comment on lines +132 to +139

💡 Verification agent

🧩 Analysis chain

Export macro missing on new intrinsic (will break dynamic linking/FFI)

All neighboring intrinsics use TVM_DLL. The new declaration lacks it, risking missing symbol export from the shared library.

Apply:

-const Op &ptx_fence_barrier_init();
+TVM_DLL const Op &ptx_fence_barrier_init();

Verify the op is also defined/registered (to avoid undefined reference at link/runtime):


🏁 Script executed:

#!/bin/bash
# Expect at least one definition/registration for the new op
rg -n 'ptx_fence_barrier_init' -C2

Length of output: 339


🏁 Script executed:

#!/bin/bash
# Search for any implementation or registration of the new intrinsic
rg -n 'ptx_fence_barrier_init' -C2
rg -n 'ptx_fence_barrier_init' -C2 --glob '*.cc'
rg -n 'ptx_fence_barrier_init' -C2 --glob '*.cpp'
rg -n 'ptx_fence_barrier_init' -C2 --glob '*.c'

Length of output: 784


#!/bin/bash
# List potential implementation files under src/op
echo "=== Source files in src/op with C/C++ extensions ==="
find src/op -type f | grep -E "\.(cc|cpp|c|cu|cuh)$"

# Search ignoring gitignore for any mention of the intrinsic name
echo "=== Searching for 'ptx_fence_barrier_init' across all files ==="
rg -uu -n "ptx_fence_barrier_init" -C2 .

# Broader search for any 'ptx_fence' identifiers
echo "=== Searching for 'ptx_fence' across all files ==="
rg -uu -n "ptx_fence" -C2 .

# Search for any global registrations in src/op
echo "=== Searching for TVM_REGISTER_GLOBAL in src/op ==="
rg -n "TVM_REGISTER_GLOBAL" -C2 src/op

🏁 Script executed:

#!/bin/bash
# Force search across all files (including those ignored by .gitignore) for the intrinsic name
echo "=== Full repo search for 'ptx_fence_barrier_init' ==="
rg -uu -n "ptx_fence_barrier_init" -C2 .

# Broader search for any fence barrier related symbols
echo "=== Full repo search for 'fence_barrier_init' ==="
rg -uu -n "fence_barrier_init" -C2 .

# Check for global registration patterns in the entire repo
echo "=== Search for TVM_REGISTER_GLOBAL ==="
rg -uu -n "TVM_REGISTER_GLOBAL" -C2 .

Length of output: 1617


Add export macro and provide implementation/registration for ptx_fence_barrier_init

The new intrinsic declaration in src/op/builtin.h is missing the TVM_DLL export specifier and has no matching implementation or TVM_REGISTER_GLOBAL call, which will lead to undefined references at link/runtime.

• File src/op/builtin.h (line 138): the declaration

- const Op &ptx_fence_barrier_init();
+ TVM_DLL const Op &ptx_fence_barrier_init();

needs the TVM_DLL prefix to ensure the symbol is exported.
• No implementation or registration found for ptx_fence_barrier_init in the repository—add the corresponding definition in a .cc/.cu file (e.g., src/op/ptx_fence.cc) and register it with TVM_REGISTER_GLOBAL (or the appropriate registration API) following the pattern of other PTX intrinsics.


/*!
* \brief tvm intrinsics for mbarrier wait with parity bit
*
101 changes: 101 additions & 0 deletions src/op/finalize_reducer.cc
@@ -0,0 +1,101 @@
/*!
* \file src/op/finalize_reducer.cc
*
* Define finalize_reducer operator.
*/

#include "finalize_reducer.h"

#include <tvm/arith/iter_affine_map.h>
#include <tvm/tir/builtin.h>
#include <tvm/tir/op.h>
#include <tvm/tir/op_attr_types.h>

#include "../target/utils.h"

namespace tvm {
namespace tl {

using namespace tir;

FinalizeReducerOp::FinalizeReducerOp(Array<PrimExpr> args, BufferMap vmap) {
auto node = make_object<FinalizeReducerOpNode>();
node->reducer = vmap[GetVarFromAccessPtr(args[0])];
node->op = (ReducerOpType)*as_const_int(args[1]);
data_ = std::move(node);
}
Comment on lines +21 to +26

⚠️ Potential issue

Validate builder arguments and vmap lookup.

Prevent OOB access and undefined reducer buffer lookup.

-FinalizeReducerOp::FinalizeReducerOp(Array<PrimExpr> args, BufferMap vmap) {
+FinalizeReducerOp::FinalizeReducerOp(Array<PrimExpr> args, BufferMap vmap) {
   auto node = make_object<FinalizeReducerOpNode>();
-  node->reducer = vmap[GetVarFromAccessPtr(args[0])];
-  node->op = (ReducerOpType)*as_const_int(args[1]);
+  ICHECK_EQ(args.size(), 2) << "FinalizeReducer expects exactly 2 arguments";
+  Var data_var = GetVarFromAccessPtr(args[0]);
+  ICHECK(vmap.count(data_var)) << "Unknown reducer buffer var in access_ptr";
+  node->reducer = vmap[data_var];
+  const int64_t* op_i64 = as_const_int(args[1]);
+  ICHECK(op_i64) << "Second argument must be a constant integer (ReducerOpType)";
+  node->op = static_cast<ReducerOpType>(*op_i64);
   data_ = std::move(node);
 }
🤖 Prompt for AI Agents
In src/op/finalize_reducer.cc around lines 21-26, the constructor assumes args
has at least 2 elements, that args[1] is a valid constant int, and that vmap
contains the reducer key returned by GetVarFromAccessPtr(args[0]); add explicit
validation: check args.size() >= 2 and throw or ICE with a clear message if not;
call as_const_int(args[1]) and verify it returns non-null before dereferencing,
then validate the resulting op index is within the expected enum range; compute
the reducer key from GetVarFromAccessPtr(args[0]) and ensure vmap contains that
key (use find/count) before indexing, throwing a descriptive error if missing;
only after these checks assign node->reducer and node->op and set data_.


Stmt FinalizeReducerOpNode::Lower(const LowerArgs &T,
arith::Analyzer *analyzer) const {
auto buffer = T.buffer_remap[reducer];
auto opt_layout = T.layout_map.Get(reducer);
ICHECK(opt_layout);
ICHECK(opt_layout->as<Fragment>());
auto layout = opt_layout->as<Fragment>().value();
Array<PrimExpr> indices_0;
indices_0.reserve(layout->OutputDim());
for (int i = 0; i < layout->OutputDim(); ++i)
indices_0.push_back(Var("__finred_" + std::to_string(i)));

const int64_t *p_extent = as_const_int(layout->ReplicateExtent());
ICHECK(p_extent);
int extent = *p_extent, scale = 1;
ICHECK(extent == 1 || extent == *as_const_int(T.thread_bounds->extent))
<< "Illegal finalize_reducer: extent=" << extent
<< "; T.thread_bounds=" << T.thread_bounds;

Comment on lines +40 to +46

⚠️ Potential issue

Avoid null deref on thread_bounds->extent. Cache total threads.

Ensure extent is an IntImm and reuse it.

-  const int64_t *p_extent = as_const_int(layout->ReplicateExtent());
+  const int64_t *p_extent = as_const_int(layout->ReplicateExtent());
   ICHECK(p_extent);
-  int extent = *p_extent, scale = 1;
-  ICHECK(extent == 1 || extent == *as_const_int(T.thread_bounds->extent))
+  int extent = *p_extent;
+  const int64_t* p_total = as_const_int(T.thread_bounds->extent);
+  ICHECK(p_total) << "T.thread_bounds->extent must be a constant integer";
+  int total_threads = static_cast<int>(*p_total);
+  ICHECK(extent == 1 || extent == total_threads)
       << "Illegal finalize_reducer: extent=" << extent
       << "; T.thread_bounds=" << T.thread_bounds;
🤖 Prompt for AI Agents
In src/op/finalize_reducer.cc around lines 40 to 46, avoid a potential
null-dereference and redundant casts by validating that thread_bounds and its
extent are non-null and an IntImm, cache the total thread count once, and reuse
that value: check T.thread_bounds is non-null, obtain and dynamic_cast/isa the
extent to IntImm (ICHECK with a clear message if not), store the integer extent
in a local variable (e.g. total_threads) and reuse it in the subsequent
checks/logging instead of re-evaluating or re-casting.

if (extent == 1)
return Evaluate(0);

std::array op_names{"tl::SumOp", "tl::MaxOp", "tl::MinOp"};
auto op_str = op_names[(int)op];

// adopted from ReduceOp
int reducing_threads = extent;
std::stringstream ss;
auto thread_offset = T.thread_bounds->min;
if (TargetIsHopper(T.target)) {
auto all_threads = T.thread_bounds->extent;
ss << "tl::AllReduce<" << op_str << ", " << reducing_threads << ", " << 1
<< ", " << thread_offset << ", " << all_threads << ">::run_hopper";
} else {
ss << "tl::AllReduce<" << op_str << ", " << reducing_threads << ", " << 1
<< ", " << thread_offset << ">::run";
}
Array<PrimExpr> thread_reduce_args = {StringImm(ss.str()),
BufferLoad(buffer, indices_0)};
if (reducing_threads >= 32) {
PrimExpr workspace =
T.AddWorkspace(*as_const_int(T.thread_bounds->extent), buffer->dtype);
thread_reduce_args.push_back(workspace);
}
auto call = Call(buffer->dtype, builtin::call_extern(), thread_reduce_args);
Stmt body = BufferStore(buffer, call, indices_0);

// make the outer spatial loop
for (int i = layout->OutputDim() - 1; i >= 0; i--) {
body = For(indices_0[i].as<Var>().value(), 0, layout->OutputShape()[i],
ForKind::kParallel, body);
}

return body;
}

LayoutMap FinalizeReducerOpNode::InferLayout(const LayoutInferArgs &T,
InferLevel level) const {
LayoutMap layout_map;
layout_map.Set(reducer, T.layout_map.Get(reducer).value());
return layout_map;
}

TileOperator FinalizeReducerOpNode::Clone() const {
auto node = make_object<FinalizeReducerOpNode>(*this);
return TileOperator(node);
}

TIR_REGISTER_TL_OP(FinalizeReducerOp, finalize_reducer)
.set_num_inputs(1)
.set_attr<TCallEffectKind>("TCallEffectKind",
Integer(CallEffectKind::kOpaque));
} // namespace tl
} // namespace tvm
46 changes: 46 additions & 0 deletions src/op/finalize_reducer.h
@@ -0,0 +1,46 @@
// Copyright (c) Tile-AI Corporation.
// Licensed under the MIT License.

/*!
* \file src/op/finalize_reducer.h
* \brief Define finalize_reducer operator.
*/

#ifndef TVM_TL_OP_FINALIZE_REDUCER_H_
#define TVM_TL_OP_FINALIZE_REDUCER_H_

#include "../transform/layout_reducer.h"
#include "./operator.h"

namespace tvm {
namespace tl {

using namespace tir;

class FinalizeReducerOpNode : public TileOperatorNode {
public:
tir::Buffer reducer;
ReducerOpType op;

static constexpr const char *_type_key = "tl.FinalizeReducerOp";
TVM_DECLARE_FINAL_OBJECT_INFO(FinalizeReducerOpNode, TileOperatorNode);

Stmt Lower(const LowerArgs &T, arith::Analyzer *analyzer) const override;
LayoutMap InferLayout(const LayoutInferArgs &T,
InferLevel level) const override;
static const Op &Get();
TileOperator Clone() const;
};

class FinalizeReducerOp : public TileOperator {
public:
TVM_DEFINE_OBJECT_REF_METHODS(FinalizeReducerOp, TileOperator,
FinalizeReducerOpNode);
TVM_DLL FinalizeReducerOp(Array<PrimExpr> args, BufferMap vmap);
static const Op &Get();
};

} // namespace tl
} // namespace tvm

#endif // TVM_TL_OP_FINALIZE_REDUCER_H_
26 changes: 21 additions & 5 deletions src/op/parallel.cc
@@ -124,6 +124,12 @@ void ParallelLoopNestVisitor::VisitStmt_(const ForNode *op) {
p->loop_vars_.push_back(
IterVar(Range(op->min, op->extent), op->loop_var, IterVarType::kDataPar));
p->analyzer_.Bind(op->loop_var, Range::FromMinExtent(op->min, op->extent));
+  auto reducer_info_map =
+      op->annotations.Get(attr::kReducerInfo)->as<Map<Var, ReducerInfo>>();
+  if (reducer_info_map) {
+    for (auto &&[buffer, info] : reducer_info_map.value())
+      p->reducer_info_map_.Set(buffer, info);
+  }
StmtExprVisitor::VisitStmt_(op);
Comment on lines +127 to 133

⚠️ Potential issue

Potential null deref when reading reducer_info annotation

Calling op->annotations.Get(attr::kReducerInfo)->as<...>() without checking that Get() returned a value can trigger a fatal when the annotation is absent. Use the same safe pattern you used elsewhere in this file for coalesced_width.

Apply:

-  auto reducer_info_map =
-      op->annotations.Get(attr::kReducerInfo)->as<Map<Var, ReducerInfo>>();
-  if (reducer_info_map) {
-    for (auto &&[buffer, info] : reducer_info_map.value())
-      p->reducer_info_map_.Set(buffer, info);
-  }
+  if (auto obj = op->annotations.Get(attr::kReducerInfo)) {
+    if (auto reducer_info_map = obj.value().as<Map<Var, ReducerInfo>>()) {
+      for (auto &&[buffer, info] : reducer_info_map.value()) {
+        p->reducer_info_map_.Set(buffer, info);
+      }
+    }
+  }
🤖 Prompt for AI Agents
In src/op/parallel.cc around lines 127 to 133, the code calls
op->annotations.Get(attr::kReducerInfo)->as<...>() directly which can
dereference a null optional if the annotation is absent; change this to first
capture the result of Get() into a local Optional<ObjectRef> (or similar) and
check it before calling as<...>(), then only iterate and Set entries when the
as<...>() call returns a valid Map; mirror the safe pattern used for
coalesced_width (check Get() result, then as(), then use value()) so no null
deref occurs.

Comment on lines +127 to 133

⚠️ Potential issue

Check for null annotation before casting.

The code assumes attr::kReducerInfo exists in annotations and performs an unchecked cast. This could cause runtime errors if the annotation is missing.

Apply this diff to add null checking:

-  auto reducer_info_map =
-      op->annotations.Get(attr::kReducerInfo)->as<Map<Var, ReducerInfo>>();
-  if (reducer_info_map) {
-    for (auto &&[buffer, info] : reducer_info_map.value())
-      p->reducer_info_map_.Set(buffer, info);
-  }
+  if (op->annotations.count(attr::kReducerInfo)) {
+    auto reducer_info_map =
+        op->annotations.Get(attr::kReducerInfo)->as<Map<Var, ReducerInfo>>();
+    if (reducer_info_map) {
+      for (auto &&[buffer, info] : reducer_info_map.value())
+        p->reducer_info_map_.Set(buffer, info);
+    }
+  }
🤖 Prompt for AI Agents
In src/op/parallel.cc around lines 127 to 133, the code unconditionally
retrieves and casts the attr::kReducerInfo annotation which can be absent and
cause a runtime error; change the logic to first check that op->annotations
contains attr::kReducerInfo (or that Get(attr::kReducerInfo) is non-null) before
calling as<Map<Var, ReducerInfo>>(), only perform the cast and iterate when the
annotation is present, otherwise skip setting reducer_info_map_; ensure you
handle both optional/nullable returns consistently to avoid dereferencing null.

}

@@ -202,6 +208,11 @@ LayoutMap ParallelOpNode::InferLayout(const LayoutInferArgs &T,
Buffer source_buffer, read_source_buffer;
for (const auto &[buffer, indices] : indice_map_) {
if (T.layout_map.count(buffer)) {
+      // skip reducers with rep=ALL
+      if (auto info = reducer_info_map_.Get(buffer->data);
+          info && info.value()->rep == ReducerRepType::ALL)
+        continue;

auto frag = T.layout_map[buffer].as<Fragment>().value();
if (buffer_is_write_.count(buffer)) {
source_buffer = buffer;
@@ -298,6 +309,16 @@ LayoutMap ParallelOpNode::InferLayout(const LayoutInferArgs &T,
IfBufferRemapLoopGenerator::run(root_, T.buffer_remap, T.layout_map);
int vector_size = GetVectorizeSize(maybe_remapped_root_);

+  PrimExpr loop_total_size = 1;
+  for (Stmt l = root_; l.as<For>().has_value();
+       l = l.as<For>().value()->body)
+    loop_total_size = loop_total_size * l.as<For>().value()->extent;
+  while (!analyzer_.CanProve(
+             floormod(loop_total_size,
+                      T.thread_bounds->extent * vector_size) == 0) &&
+         vector_size > 1)
+    vector_size /= 2;

// Check if coalesced_width is defined
if (auto coalesced_width =
root_->annotations.Get(tl::attr::coalesced_width)) {
@@ -343,11 +364,6 @@ LayoutMap ParallelOpNode::InferLayout(const LayoutInferArgs &T,
for (const auto &[buffer, _] : indice_map_) {
if (T.layout_map.count(buffer)) {
auto fragment = T.layout_map[buffer].as<Fragment>().value();
-      // TODO: Add thread checks for replicated cases
-      // need to wildcard match the rhs with lhs
-      if (!is_one(loop_layout_->ReplicateExtent()) ||
-          !is_one(fragment->ReplicateExtent()))
-        continue;
auto vars =
loop_vars_.Map([](const IterVar &iv) { return PrimExpr(iv->var); });
if (!ProveFragmentContains(loop_layout_, fragment, vars,
5 changes: 4 additions & 1 deletion src/op/parallel.h
@@ -10,7 +10,8 @@
#include <tvm/tir/stmt_functor.h>

#include "../layout/layout.h"
-#include "operator.h"
+#include "../transform/layout_reducer.h"
+#include "./operator.h"

namespace tvm {
namespace tl {
@@ -112,6 +113,8 @@ class ParallelOpNode : public TileOperatorNode {
Array<IterVar> loop_vars_;
// Analyzer for simplifying and analyzing expressions, mutable for lazy use.
mutable arith::Analyzer analyzer_;
+  // Mapping from buffer to reducer info.
+  Map<Var, ReducerInfo> reducer_info_map_;
};

class ParallelOp : public TileOperator {
4 changes: 2 additions & 2 deletions src/op/reduce.cc
@@ -13,6 +13,7 @@

#include "../layout/utils.h"
#include "../op/parallel.h"
+#include "../target/utils.h"
#include "../transform/loop_partition.h"
#include "tir/transforms/ir_utils.h"

@@ -237,9 +238,8 @@ Stmt ReduceOpNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const {
int reducing_threads = (*extent) * (*scale);
std::stringstream ss;

-  bool has_arch = T.target->attrs.count("arch") > 0;
   auto thread_offset = T.thread_bounds->min;
-  if (has_arch && Downcast<String>(T.target->attrs["arch"]) == "sm_90") {
+  if (TargetIsHopper(T.target)) {
auto all_threads = T.thread_bounds->extent;
ss << "tl::AllReduce<" << this->MakeCodegenReducer() << ", "
<< reducing_threads << ", " << (*scale) << ", " << thread_offset
5 changes: 1 addition & 4 deletions src/target/codegen_cuda.cc
@@ -1134,10 +1134,7 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) {
} else if (op->op.same_as(tl::sync_grid())) {
this->need_cooperative_groups_ = true;
this->PrintIndent();
-    this->stream << "cooperative_groups::grid_group grid = "
-                    "cooperative_groups::this_grid();\n";
-    this->PrintIndent();
-    this->stream << "grid.sync();\n";
+    this->stream << "cooperative_groups::this_grid().sync();\n";
} else if (op->op.same_as(tl::loop_break())) {
Comment on lines 1135 to 1138

💡 Verification agent

🧩 Analysis chain

Double-check cooperative launch + device linking in build/runtime

Grid-wide sync requires cooperative launches and separable compilation/device linking. This change doesn’t alter that requirement, but worth verifying CI/build and runtime paths still satisfy it.

  • Build flags: NVCC should compile with device linking enabled (e.g., -rdc=true or CMake separable compilation).
  • Runtime: Kernels that may hit tl::sync_grid() should be launched with cooperative APIs (e.g., cudaLaunchCooperativeKernel / cuLaunchCooperativeKernel) on devices reporting cooperative support.

Run this quick scan to confirm:


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "Occurrences of inline grid sync in CUDA codegen (context):"
rg -nC2 'this_grid\(\)\.sync\(\)' src || true

echo
echo "Check for cooperative kernel launch usage (CUDA/Driver API):"
rg -nP 'cudaLaunchCooperativeKernel|cuLaunchCooperativeKernel' -C2 -g '!**/third_party/**' || true

echo
echo "Check build files for device linking/separable compilation:"
rg -nP '(-rdc=|relocatable-device-code|SEPARABLE_COMPILATION|CMAKE_CUDA_SEPARABLE_COMPILATION)' -n -S -C2 -- \
  CMakeLists.txt '**/*.cmake' || true

Length of output: 9214


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "Listing CMake build files (depth <= 3):"
find . -maxdepth 3 -type f \( -name 'CMakeLists.txt' -o -name '*.cmake' \)

echo
echo "Searching for -rdc flags in CMake build files:"
rg -nP '(-rdc\s*=\s*true|-rdc)' -g '*.cmake' -g 'CMakeLists.txt' || true

echo
echo "Searching for CMAKE_CUDA_SEPARABLE_COMPILATION:"
rg -nP 'CMAKE_CUDA_SEPARABLE_COMPILATION' -g '*.cmake' -g 'CMakeLists.txt' || true

echo
echo "Searching for relocatable-device-code keywords:"
rg -nP 'relocatable-device-code|SEPARABLE_COMPILATION' -g '*.cmake' -g 'CMakeLists.txt' || true

Length of output: 786


Ensure CUDA separable compilation and cooperative launches are properly enabled

We didn’t find any -rdc=true, CMAKE_CUDA_SEPARABLE_COMPILATION, or equivalent relocatable-device-code settings in your CMake files, which are required whenever device code uses cooperative_groups::this_grid().sync(). Without these, kernels containing grid-wide sync won’t link or launch correctly.

Please update your build and CI to include:

  • In your top-level CMakeLists.txt (or wherever you enable CUDA):
    enable_language(CUDA)
    set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
    # Or explicitly:
    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -rdc=true")
  • For targets using CUDA device code:
    target_compile_options(<your_cuda_target>
      PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true>
    )
  • Verify your CI/system build invocation passes these flags (e.g. -DCMAKE_CUDA_SEPARABLE_COMPILATION=ON).

At runtime, you’re already using cudaLaunchCooperativeKernel in wrapper.py (lines 328–331), and your codegen emits cooperative_groups::this_grid().sync();. Double-check that:

  • The device reports CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH before attempting a grid sync.
  • All kernels containing grid sync are indeed launched via the cooperative APIs.

With these in place, this_grid().sync() will link and run correctly.

🤖 Prompt for AI Agents
In src/target/codegen_cuda.cc around lines 1135 to 1138, the code emits
cooperative_groups::this_grid().sync() but the build/CI likely lacks CUDA
separable compilation and relocatable device code flags; enable separable
compilation in CMake (e.g. enable_language(CUDA) and set
CMAKE_CUDA_SEPARABLE_COMPILATION ON or add -rdc=true to CUDA flags), add
target-specific compile options (target_compile_options(<cuda_target> PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-rdc=true>)), ensure CI build invocations propagate
-DCMAKE_CUDA_SEPARABLE_COMPILATION=ON or equivalent, and confirm at runtime that
CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH is supported and kernels with grid sync
are launched via cooperative launch APIs (as used in wrapper.py).

this->PrintIndent();
this->stream << "break;\n";