Skip to content

Commit 031b834

Browse files
📝 Add docstrings to reducer_0825
Docstrings generation was requested by @LeiWang1999. * #757 (comment) The following files were modified: * `setup.py` * `src/op/builtin.h` * `src/op/finalize_reducer.cc` * `src/op/finalize_reducer.h` * `src/op/parallel.cc` * `src/op/parallel.h` * `src/op/reduce.cc` * `src/target/codegen_cuda.cc` * `src/tl_templates/cuda/common.h` * `src/transform/layout_inference.cc` * `src/transform/layout_reducer.cc` * `src/transform/layout_reducer.h` * `src/transform/merge_shared_memory_allocations.cc` * `src/transform/storage_access.cc` * `src/transform/warp_specialized_rewriter.cc` * `testing/python/autotune/test_tilelang_autotune_with_inputs.py` * `tilelang/engine/phase.py` * `tilelang/language/customize.py` * `tilelang/language/reduce.py` * `tilelang/transform/__init__.py`
1 parent 2af3f22 commit 031b834

20 files changed

+749
-632
lines changed

setup.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -749,9 +749,20 @@ def build_cython(self, ext):
749749

750750
def build_cmake(self, ext):
751751
"""
752-
Build a single CMake-based extension.
753-
754-
:param ext: The extension (an instance of CMakeExtension).
752+
Build a single CMake-based extension by generating a CMake config and invoking CMake/Ninja.
753+
754+
Generates or updates a config.cmake in the build directory (based on the extension's sourcedir),
755+
injecting LLVM/CUDA/ROCm and Python settings, then runs CMake to configure and build the target.
756+
When running an in-place build the resulting library is placed under ./tilelang/lib; otherwise the
757+
standard extension output directory is used.
758+
759+
Parameters:
760+
ext: The CMakeExtension to build; its `sourcedir` should contain the TVM/CMake `config.cmake`
761+
template under `3rdparty/tvm/cmake/`.
762+
763+
Raises:
764+
subprocess.CalledProcessError: If the CMake configuration or build commands fail.
765+
OSError: If filesystem operations (read/write) fail.
755766
"""
756767
# Only setup LLVM if it's enabled
757768
llvm_config_path = "OFF"

src/op/builtin.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,14 @@
1111
#include <tvm/ir/transform.h>
1212

1313
namespace tvm {
14+
/*!
15+
* \brief Create the TVM intrinsic that initializes a PTX fence barrier.
16+
*
17+
* Initializes a PTX fence-style barrier used to coordinate asynchronous memory
18+
* operations (for example, TMA/TMA_STORE). Returns the Op representing this
19+
* intrinsic for use in TIR lowering and code generation.
20+
*
21+
*/
1422
namespace tl {
1523

1624
namespace attr {

src/op/finalize_reducer.cc

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,50 @@ namespace tl {
1818

1919
using namespace tir;
2020

21+
/**
22+
* @brief Construct a FinalizeReducerOp from TL operator arguments and a buffer map.
23+
*
24+
* Extracts the reducer Buffer from `vmap` using the variable referenced by `args[0]`
25+
* and sets the reduction operation type from the integer code in `args[1]`.
26+
*
27+
* @param args TL operator arguments: expects at least two elements where
28+
* `args[0]` is an access pointer identifying the reducer variable and
29+
* `args[1]` is an integer encoding a `ReducerOpType` (e.g., Sum/Max/Min).
30+
* @param vmap Mapping from variables to Buffers used to look up the reducer Buffer.
31+
*/
2132
FinalizeReducerOp::FinalizeReducerOp(Array<PrimExpr> args, BufferMap vmap) {
2233
auto node = make_object<FinalizeReducerOpNode>();
2334
node->reducer = vmap[GetVarFromAccessPtr(args[0])];
2435
node->op = (ReducerOpType)*as_const_int(args[1]);
2536
data_ = std::move(node);
2637
}
2738

39+
/**
40+
* @brief Lower the finalize_reducer TL operator to a TIR statement.
41+
*
42+
* Lowers the operator that finalizes a reducer by performing a thread-wide AllReduce
43+
* across the reducer's output elements and writing the reduced value back into the
44+
* reducer buffer. The function:
45+
* - Fetches the reducer buffer and expects its layout to be a Fragment.
46+
* - Builds index Vars for each output dimension.
47+
* - Reads the layout's ReplicateExtent and:
48+
* - if extent == 1, emits a no-op Evaluate(0);
49+
* - otherwise constructs an AllReduce extern call (uses `run_hopper` when the
50+
* compilation target is Hopper) with an optional workspace (allocated via
51+
* T.AddWorkspace when reducing_threads >= 32) and stores the result via
52+
* BufferStore.
53+
* - Wraps the store in parallel outer For loops over each output dimension.
54+
*
55+
* @param T Lowering context containing buffer remapping, layout map, thread bounds,
56+
* target, and helper methods (e.g., AddWorkspace).
57+
* @param analyzer Arithmetic analyzer (unused by this implementation but provided
58+
* for consistency with lowering API).
59+
* @return Stmt The lowered TIR statement representing the AllReduce and surrounding loops.
60+
*
61+
* @note The function ICHECKs that the reducer layout is present and a Fragment,
62+
* and that ReplicateExtent is either 1 or equal to the thread block extent;
63+
* violations cause a fatal check failure.
64+
*/
2865
Stmt FinalizeReducerOpNode::Lower(const LowerArgs &T,
2966
arith::Analyzer *analyzer) const {
3067
auto buffer = T.buffer_remap[reducer];
@@ -81,13 +118,32 @@ Stmt FinalizeReducerOpNode::Lower(const LowerArgs &T,
81118
return body;
82119
}
83120

121+
/**
122+
* @brief Infer and return the layout mapping for the reducer buffer.
123+
*
124+
* Copies the existing layout for the reducer from the provided LayoutInferArgs into
125+
* a new LayoutMap and returns it. The inference does not modify the layout; it
126+
* preserves the reducer's current layout.
127+
*
128+
* @param T Provides the input layout map from which the reducer's layout is copied.
129+
* @param level Unused by this operator; present for API compatibility.
130+
* @return LayoutMap A map that contains the reducer buffer mapped to its original layout.
131+
*/
84132
LayoutMap FinalizeReducerOpNode::InferLayout(const LayoutInferArgs &T,
85133
InferLevel level) const {
86134
LayoutMap layout_map;
87135
layout_map.Set(reducer, T.layout_map.Get(reducer).value());
88136
return layout_map;
89137
}
90138

139+
/**
140+
* @brief Create a deep copy of this FinalizeReducerOpNode and wrap it as a TileOperator.
141+
*
142+
* Constructs a new FinalizeReducerOpNode by copying the current node state and returns
143+
* a TileOperator that owns the copied node.
144+
*
145+
* @return TileOperator A TileOperator that contains a deep copy of this node.
146+
*/
91147
TileOperator FinalizeReducerOpNode::Clone() const {
92148
auto node = make_object<FinalizeReducerOpNode>(*this);
93149
return TileOperator(node);

src/op/finalize_reducer.h

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,69 @@
1212
#include "../transform/layout_reducer.h"
1313
#include "./operator.h"
1414

15+
/**
16+
* FinalizeReducer operator node for Tile IR.
17+
*
18+
* Represents a TL-level operator that finalizes a reducer buffer into a
19+
* result using a specified reducer operation.
20+
*
21+
* Public members:
22+
* - reducer: the tir::Buffer that holds the intermediate reduction values.
23+
* - op: the reducer operation to apply when finalizing values.
24+
*/
25+
26+
/**
27+
* Lower this operator to a TIR statement.
28+
*
29+
* @param T Lowering arguments (buffers, indices, and other lowering context).
30+
* @param analyzer Arithmetic analyzer used to simplify expressions during lowering.
31+
* @return A tir::Stmt that implements the finalize-reducer semantics for the provided
32+
* lowering context.
33+
*/
34+
35+
/**
36+
* Infer layout mapping for this operator.
37+
*
38+
* Determines how input and output buffer layouts relate for the finalize-reducer
39+
* operator at the given inference level.
40+
*
41+
* @param T Layout inference arguments (including operand layouts and shapes).
42+
* @param level Inference precision level.
43+
* @return A LayoutMap describing the inferred layouts.
44+
*/
45+
46+
/**
47+
* Get the singleton Op object representing this operator.
48+
*
49+
* @return A reference to the Op describing FinalizeReducer.
50+
*/
51+
52+
/**
53+
* Create a deep copy of this operator node as a TileOperator.
54+
*
55+
* @return A TileOperator handle that is an independent clone of this node.
56+
*/
57+
58+
/**
59+
* Public wrapper for FinalizeReducerOpNode.
60+
*
61+
* Provides the reference semantics and construction API used by callers.
62+
*/
63+
64+
/**
65+
* Construct a FinalizeReducerOp from TL-level arguments.
66+
*
67+
* @param args Positional primitive expressions that parameterize the operator
68+
* (e.g., shapes, axis indices). Their meaning is documented at call sites
69+
* where it is not obvious from the name or type.
70+
* @param vmap Mapping from operand names to tir::Buffer instances used by this operator.
71+
*/
72+
73+
/**
74+
* Get the Op singleton for the public FinalizeReducerOp handle.
75+
*
76+
* @return A reference to the Op describing FinalizeReducer.
77+
*/
1578
namespace tvm {
1679
namespace tl {
1780

src/op/parallel.cc

Lines changed: 22 additions & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,14 @@ class IfBufferRemapLoopGenerator : public StmtExprMutator {
119119
Map<Buffer, Layout> layout_map_;
120120
};
121121

122+
/**
123+
* @brief Handle a parallel For node during traversal, collecting loop metadata.
124+
*
125+
* Visits a parallel loop, asserts the loop is parallel, records a data-parallel
126+
* IterVar for the loop, binds the loop variable range into the analyzer scope,
127+
* and extracts any reducer information from the loop's annotations into the
128+
* visitor's reducer_info_map_. Continues traversal into the loop body.
129+
*/
122130
void ParallelLoopNestVisitor::VisitStmt_(const ForNode *op) {
123131
ICHECK(op->kind == ForKind::kParallel);
124132
p->loop_vars_.push_back(
@@ -147,19 +155,6 @@ void ParallelLoopNestVisitor::VisitStmt_(const BufferStoreNode *op) {
147155
StmtExprVisitor::VisitStmt_(op);
148156
}
149157

150-
/**
151-
* @brief Visit a BufferLoad node and record/validate index mapping for
152-
* fragment-local buffers.
153-
*
154-
* If the loaded buffer's scope is "local.fragment", this records the load
155-
* indices in the visitor's indice_map_ when seen for the first time. If an
156-
* entry already exists, the previously recorded indices are asserted
157-
* structurally equal to the current indices.
158-
*
159-
* This ensures all accesses to the same fragment-local buffer within the
160-
* parallel loop use a consistent index map. The function then continues
161-
* standard expression visitation.
162-
*/
163158
void ParallelLoopNestVisitor::VisitExpr_(const BufferLoadNode *op) {
164159
if (op->buffer.scope() == "local.fragment") {
165160
if (p->indice_map_.find(op->buffer) != p->indice_map_.end()) {
@@ -173,91 +168,42 @@ void ParallelLoopNestVisitor::VisitExpr_(const BufferLoadNode *op) {
173168
StmtExprVisitor::VisitExpr_(op);
174169
}
175170

176-
/**
177-
* @brief Construct a ParallelOpNode from a parallel loop nest root.
178-
*
179-
* Initializes the node with the given For loop as the root of the parallel
180-
* operator and immediately runs the internal ParallelLoopNestVisitor to collect
181-
* loop and buffer access information from the nested body.
182-
*
183-
* @param root The root For node representing the parallel loop nest to be
184-
* analyzed.
185-
*/
186171
ParallelOpNode::ParallelOpNode(For root) : root_(root), V(this) {
187172
V.VisitStmt(root);
188173
}
189174

190-
/**
191-
* @brief Create a copy of this ParallelOpNode wrapped as a TileOperator.
192-
*
193-
* Returns a new TileOperator that holds a deep copy of this ParallelOpNode.
194-
*
195-
* @return TileOperator A TileOperator owning a copy of this node.
196-
*/
197175
TileOperator ParallelOpNode::Clone() const {
198176
auto op = make_object<ParallelOpNode>(*this);
199177
return ParallelOp(op);
200178
}
201179

202-
/**
203-
* @brief No-op lowering: return the stored root statement unchanged.
204-
*
205-
* This implementation does not perform any transformation and returns the
206-
* operator's original root For statement as-is.
207-
*
208-
* @param T Lowering arguments (unused).
209-
* @return Stmt The original root statement held by this ParallelOpNode.
210-
*/
211180
Stmt ParallelOpNode::Lower(const LowerArgs &T,
212181
arith::Analyzer *analyzer) const {
213182
return root_;
214183
}
215184

216-
/**
217-
* @brief Check whether a buffer is indexed by the loop's canonical (common)
218-
* iteration variables.
219-
*
220-
* Returns true if the recorded index mapping for `buffer` is structurally equal
221-
* to the sequence of loop iteration variables for this parallel op (i.e., the
222-
* buffer is accessed using the common access indices of the loop nest).
223-
*
224-
* @param buffer The buffer to check.
225-
* @return true if the buffer's index map equals the loop's iteration variables;
226-
* false otherwise.
227-
*/
228185
bool ParallelOpNode::IsCommonAccessIndice(const Buffer &buffer) const {
229186
auto common_indice = loop_vars_.Map([](const auto &iv) { return iv->var; });
230187
return StructuralEqual()(indice_map_[buffer], common_indice);
231188
}
232189

233-
/**
234-
* @brief Infer buffer layouts for a Parallel operator based on the chosen
235-
* inference level.
190+
/*! \brief Infer the layout for parallel operations based on different inference
191+
* levels
236192
*
237-
* Attempts to compute a consistent LayoutMap for buffers accessed by a parallel
238-
* loop (root_) using explicit input layouts (T.layout_map), thread bounds
239-
* (T.thread_bounds), and optional buffer remapping/vectorization information in
240-
* T. Behavior depends on the supplied InferLevel:
241-
* - kStrict: only accept pre-existing loop_layout_ (no inference).
242-
* - kCommon: allow inference from explicit buffer fragments when available.
243-
* - kFree: attempt more aggressive inference (derive loop partition from
244-
* read/write fragments, plan partitioning from vectorization/thread bounds, and
245-
* add predicates to constrain replication when necessary).
193+
* The inference level controls how aggressively we try to infer and optimize
194+
* layouts:
195+
* - kStrict (2): Most conservative level. Only allows explicitly defined
196+
* layouts. Returns an empty layout map if loop_layout_ is not already defined.
197+
* Used when exact layout control is required.
246198
*
247-
* This method may mutate the node's internal state (sets loop_layout_ when
248-
* inferred and registers predicates via AddPredicate) and consults analyzer_
249-
* for symbolic proofs.
199+
* - kCommon (1): Intermediate level between strict and free.
200+
* Allows common layout patterns while maintaining some
201+
* constraints.
250202
*
251-
* @param T Container of auxiliary inputs used for inference (buffer_remap,
252-
* layout_map, and thread_bounds). The function uses T.layout_map for source
253-
* fragments and T.thread_bounds to bind thread-range information in inferred
254-
* fragments.
255-
* @param level Controls inference aggressiveness (kStrict, kCommon, kFree).
256-
* @return LayoutMap A map of buffers to inferred Fragment layouts for buffers
257-
* that did not already have layouts in T.layout_map. Returns an empty map when
258-
* no inference was performed.
259-
* @throws LayoutConflictException If a computed loop partition conflicts with
260-
* an existing buffer fragment (incompatible thread mappings).
203+
* - kFree (0): Most permissive level. Allows maximum optimization freedom.
204+
* Will attempt layout inference even without source buffers.
205+
* Can generate new layouts based on vectorization and thread
206+
* bounds. Used when maximum performance optimization is desired.
261207
*/
262208
LayoutMap ParallelOpNode::InferLayout(const LayoutInferArgs &T,
263209
InferLevel level) const {
@@ -446,20 +392,6 @@ LayoutMap ParallelOpNode::InferLayout(const LayoutInferArgs &T,
446392
return results;
447393
}
448394

449-
/**
450-
* @brief Retrieve the loop's thread predicate with the thread variable
451-
* substituted.
452-
*
453-
* If a predicate is set for this ParallelOpNode, returns a copy of that
454-
* predicate where the placeholder input (InputPlaceholder(0)) is replaced by
455-
* the provided thread_var. If no predicate is defined, returns an empty
456-
* Optional.
457-
*
458-
* @param thread_var The thread loop variable to substitute for the predicate's
459-
* input placeholder.
460-
* @return Optional<PrimExpr> The substituted predicate expression, or
461-
* std::nullopt if none is defined.
462-
*/
463395
Optional<PrimExpr> ParallelOpNode::GetPredicate(Var thread_var) const {
464396
if (predicate_.defined()) {
465397
return Substitute(predicate_.value(), {{InputPlaceholder(0), thread_var}});
@@ -468,32 +400,6 @@ Optional<PrimExpr> ParallelOpNode::GetPredicate(Var thread_var) const {
468400
}
469401
}
470402

471-
/**
472-
* @brief Construct the complete fragment layout for a buffer within the
473-
* parallel loop.
474-
*
475-
* Given a buffer referenced inside the parallel loop, return a Fragment that
476-
* maps the buffer's logical indices to the loop's thread space and replication
477-
* extent.
478-
*
479-
* Detailed behavior:
480-
* - Precondition: a loop layout (loop_layout_) must be defined.
481-
* - If the buffer uses the common access indices of the loop, the loop's
482-
* fragment is returned directly.
483-
* - Otherwise, the function:
484-
* - Computes the buffer's bijective index by appending the flattened
485-
* replication expression for unused iterators.
486-
* - Inverts that bijection to obtain the replication extent of the buffer's
487-
* index space and combines it with the loop's replication extent to produce the
488-
* destination replication extent.
489-
* - Builds forward index placeholders for the buffer elements and maps them
490-
* through the inverted layout and the loop layout to derive the thread binding.
491-
* - Returns a Fragment with the computed thread binding and combined
492-
* replication extent, with replicate variables condensed.
493-
*
494-
* @return Fragment The completed fragment describing thread binding and
495-
* replication extent for `buffer`.
496-
*/
497403
Fragment ParallelOpNode::CompleteBufferFragment(const Buffer &buffer) const {
498404
ICHECK(loop_layout_.defined());
499405
if (IsCommonAccessIndice(buffer)) {

0 commit comments

Comments
 (0)