2 changes: 1 addition & 1 deletion 3rdparty/tvm
Submodule tvm updated from 1815c3 to 093b2c
166 changes: 163 additions & 3 deletions src/layout/layout.cc
@@ -102,10 +102,24 @@ Array<PrimExpr> LayoutNode::OutputShape() const {
for (size_t i = 0; i < ret.size(); i++) {
auto ist = analyzer.int_set(forward_index_[i] + 1);
if (arith::is_neg_inf(ist.min()) && arith::is_pos_inf(ist.max())) {
// X-OR Expression
ret.Set(i, input_size_[i]);
// Analyzer couldn't form an IntervalSet (e.g. bitwise ops).
// Fall back to ConstIntBound to derive a safe extent.
auto cib = analyzer.const_int_bound(forward_index_[i]);
if (cib->min_value != arith::ConstIntBound::kNegInf &&
cib->max_value != arith::ConstIntBound::kPosInf &&
cib->min_value >= 0) {
// extent = max - min + 1, using 64-bit integer literal
ret.Set(i, Integer(cib->max_value - cib->min_value + 1));
} else {
// Last-resort conservative fallback to avoid OOB/crash
// Prefer to keep dimension from known input_size_ if available.
if (i < input_size_.size()) {
ret.Set(i, input_size_[i]);
} else {
ret.Set(i, Integer(1));
}
}
} else {
Comment on lines 104 to 122
⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

Improved fallback logic prevents crashes but may be overly conservative.

The updated OutputShape() fallback (lines 104-122) improves robustness when the analyzer cannot form an IntervalSet:

Improvements:

  1. Uses const_int_bound to derive safe extents when possible (lines 107-112)
  2. Falls back to input_size_[i] or 1 as last resort (lines 115-120)
  3. Preserves the CHECK(is_one(ist.min())) assertion for the normal path (line 123)

Concern: The fallback to input_size_[i] or 1 (lines 116-119) is conservative but may produce incorrect shapes for complex layouts. Consider whether a runtime error or more sophisticated analysis would be preferable to silently using potentially incorrect dimensions.

Verify that the fallback behavior is acceptable for all layout types, especially when forward_index_[i] involves bitwise operations or other constructs the analyzer cannot handle.
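
For concreteness, a minimal sketch of a layout that would take this fallback path — it assumes InputPlaceholder(i) yields the i-th input iterator Var and uses the Layout(shape, forward_index) constructor, both as they appear elsewhere in this diff; the swizzle itself is illustrative:

Array<PrimExpr> shape = {Integer(8), Integer(8)};
PrimExpr r = InputPlaceholder(0);
PrimExpr c = InputPlaceholder(1);
// An XOR swizzle is the kind of forward index the IntervalSet analysis
// cannot bound, so OutputShape() would take the ConstIntBound fallback
// for the second output dimension.
Layout swizzled(shape, {r, c ^ r});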


🏁 Script executed:

#!/bin/bash
# Search for layouts that might trigger the fallback path
rg -n -C3 'OutputShape|forward_index_' --type=cpp | grep -E '(bitwise|&|\||\^)'

Length of output: 2812


🏁 Script executed:

#!/bin/bash
# Search for forward_index_ definitions and assignments to see what expressions are stored
rg -n 'forward_index_\s*=' --type=cpp -A 2 | head -100

Length of output: 539


🏁 Script executed:

#!/bin/bash
# Look for actual bitwise operations in layout files
rg -nP '\s+\d+\s*(&|\||\^|<<|>>)\s*\d+' --type=cpp src/layout/

Length of output: 0


🏁 Script executed:

#!/bin/bash
# Find test files related to layout or OutputShape
fd -e cc -e h 'test|layout' --type f | grep -i test | head -20

Length of output: 0


🏁 Script executed:

#!/bin/bash
# Search for calls to OutputShape to see how result is used
rg -n 'OutputShape\(\)' --type=cpp -A 2 -B 1 | head -80

Length of output: 4699


🏁 Script executed:

#!/bin/bash
# Find input_size_ definition and its type
rg -n 'input_size_' --type=cpp -B 2 -A 2 | grep -E '(Array|vector|input_size_|PrimExpr)' | head -40

Length of output: 2304


🏁 Script executed:

#!/bin/bash
# Check context around the ICHECK that uses OutputShape
rg -n 'as_const_int.*OutputShape' --type=cpp -B 5 -A 5

Length of output: 632


🏁 Script executed:

#!/bin/bash
# Search for input_size_ member variable declaration in header files
rg -n 'input_size_' --type=h -B 1 -A 1

Length of output: 402


🏁 Script executed:

#!/bin/bash
# Look for other places where OutputShape result is used without as_const_int conversion
rg -n 'OutputShape\(\)\[' --type=cpp -B 2 -A 2

Length of output: 1455


🏁 Script executed:

#!/bin/bash
# Search for Layout/Fragment construction to see what input_size values are passed
rg -n 'Layout\(|Fragment\(' --type=cpp -A 2 | head -60

Length of output: 3885


🏁 Script executed:

#!/bin/bash
# Check if input_size_ elements are checked/validated to be const before being returned
rg -n 'input_size_\[' --type=cpp -B 3 | head -50

Length of output: 848


🏁 Script executed:

#!/bin/bash
# Look at the full context of layout_inference.cc lines 836-842 to understand the crash scenario
rg -n 'frag_reg_num' --type=cpp -B 8 -A 5

Length of output: 1497


Fix the OutputShape() fallback to ensure it always returns const integers, or validate input_size_ at construction time.

The fallback logic (lines 104-122) has a critical bug: it returns input_size_[i] (line 117), which may be a non-const PrimExpr since input_size_ is populated from sources like buffer->shape (see layout_reducer.cc:216-217). This violates the implicit contract that OutputShape() returns const dimensions.

Specifically, layout_inference.cc:839 will crash with an ICHECK failure:

for (auto i : frag.value()->OutputShape()) {
  auto pci = as_const_int(i);
  ICHECK(pci != nullptr);  // crashes if OutputShape() returns symbolic expr
  frag_reg_num *= *pci;
}

Fix: Either (1) validate that input_size_ elements are const at construction time, (2) ensure the fallback path only returns proven const dimensions (e.g., always Integer(1) instead of input_size_[i]), or (3) add a precondition guard preventing this fallback when input_size_ contains symbolic expressions.
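
A minimal sketch of what (2)/(3) could look like inside the fallback branch — names follow the diff above, the as_const_int guard is the substantive change, and the fatal message is illustrative:

if (cib->min_value != arith::ConstIntBound::kNegInf &&
    cib->max_value != arith::ConstIntBound::kPosInf && cib->min_value >= 0) {
  ret.Set(i, Integer(cib->max_value - cib->min_value + 1));
} else if (i < input_size_.size() && as_const_int(input_size_[i]) != nullptr) {
  // Reuse input_size_[i] only when it is already constant, so the implicit
  // "OutputShape() returns const dimensions" contract still holds.
  ret.Set(i, input_size_[i]);
} else {
  // Fail loudly instead of silently returning a possibly wrong extent.
  LOG(FATAL) << "Cannot derive a constant extent for output dimension " << i
             << "; forward index = " << forward_index_[i];
}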

// CHECK(is_one(ist.min())) << ist.min();
ret.Set(i, ist.max());
}
}
@@ -282,10 +296,156 @@ std::pair<Layout, arith::IterMapLevel> LayoutNode::InverseWithLevel() const {
return {Layout(outputs_shape, backward_index), level};
}

Layout LayoutNode::Reshape(const Array<PrimExpr> &shape,
arith::Analyzer *analyzer) const {
// Fast path: if shape is the same, return the original layout
if (StructuralEqual()(InputShape(), shape)) {
return ffi::GetRef<Layout>(this);
}

// Step 1. Prove the product of InputShape is equal to the product of shape
PrimExpr input_shape_product = Integer(1);
for (const auto &dim : InputShape()) {
input_shape_product *= dim;
}
PrimExpr shape_product = Integer(1);
for (const auto &dim : shape) {
shape_product *= dim;
}

if (analyzer) {
ICHECK(analyzer->CanProveEqual(input_shape_product, shape_product))
<< "InputShape() = " << InputShape() << " shape = " << shape;
} else {
arith::Analyzer local_analyzer;
ICHECK(local_analyzer.CanProveEqual(input_shape_product, shape_product))
<< "InputShape() = " << InputShape() << " shape = " << shape;
}

// Step 2. Create new forward indices by reshaping
// For each dimension in the new shape, we create a placeholder variable
Array<Var> new_vars;
for (size_t i = 0; i < shape.size(); ++i) {
new_vars.push_back(InputPlaceholder(i));
}
// Step 3. Compute the flat index from new shape indices
// flat_index = k0 * (s1 * s2 * ...) + k1 * (s2 * s3 * ...) + ... + kn
PrimExpr flat_index = Integer(0);
for (size_t i = 0; i < shape.size(); ++i) {
PrimExpr stride = Integer(1);
for (size_t j = i + 1; j < shape.size(); ++j) {
stride = stride * shape[j];
}
flat_index = flat_index + new_vars[i] * stride;
}
// Step 4. Convert flat index back to original shape indices
// For original shape [s0, s1, ..., sm]:
// i0 = flat_index // (s1 * s2 * ... * sm)
// i1 = (flat_index % (s1 * s2 * ... * sm)) // (s2 * s3 * ... * sm)
// ...
Array<PrimExpr> original_indices;
PrimExpr remaining = flat_index;
for (size_t i = 0; i < InputShape().size(); ++i) {
PrimExpr stride = Integer(1);
for (size_t j = i + 1; j < InputShape().size(); ++j) {
stride = stride * InputShape()[j];
}
original_indices.push_back(floordiv(remaining, stride));
remaining = floormod(remaining, stride);
}
// Step 5. Substitute original indices into forward_index_
Array<PrimExpr> new_forward_index;
for (const auto &fwd_expr : forward_index_) {
PrimExpr substituted = fwd_expr;
// Replace each InputPlaceholder(i) with original_indices[i]
for (size_t i = 0; i < InputShape().size(); ++i) {
substituted =
Substitute(substituted, {{InputPlaceholder(i), original_indices[i]}});
}
new_forward_index.push_back(substituted);
}
return Layout(shape, new_forward_index);
}

Layout FragmentNode::Reshape(const Array<PrimExpr> &shape,
arith::Analyzer *analyzer) const {
// Fast path: identical input shape, return self
if (StructuralEqual()(InputShape(), shape)) {
return ffi::GetRef<Fragment>(this);
}

// 1) Prove total number of elements remains the same
PrimExpr input_prod = Integer(1);
for (const auto &d : InputShape())
input_prod *= d;
PrimExpr shape_prod = Integer(1);
for (const auto &d : shape)
shape_prod *= d;

if (analyzer) {
ICHECK(analyzer->CanProveEqual(input_prod, shape_prod))
<< "InputShape() = " << InputShape() << " shape = " << shape
<< " input fragment layout is = " << DebugOutput();
} else {
arith::Analyzer local_analyzer;
ICHECK(local_analyzer.CanProveEqual(input_prod, shape_prod))
<< "InputShape() = " << InputShape() << " shape = " << shape;
}

// 2) Build flat index from new-shape indices
Array<Var> new_vars;
new_vars.reserve(shape.size());
for (size_t i = 0; i < shape.size(); ++i)
new_vars.push_back(InputPlaceholder(i));

PrimExpr flat = Integer(0);
for (size_t i = 0; i < shape.size(); ++i) {
PrimExpr stride = Integer(1);
for (size_t j = i + 1; j < shape.size(); ++j)
stride = stride * shape[j];
flat = flat + new_vars[i] * stride;
}

// 3) Recover original indices from flat index
Array<PrimExpr> orig_indices;
PrimExpr remain = flat;
for (size_t i = 0; i < InputShape().size(); ++i) {
PrimExpr stride = Integer(1);
for (size_t j = i + 1; j < InputShape().size(); ++j)
stride = stride * InputShape()[j];
orig_indices.push_back(floordiv(remain, stride));
remain = floormod(remain, stride);
}

// 4) Substitute old placeholders with expressions of new indices
Array<PrimExpr> new_forward_index;
for (const auto &e : forward_index_) {
PrimExpr cur = e;
for (size_t i = 0; i < InputShape().size(); ++i) {
cur = Substitute(cur, {{InputPlaceholder(i), orig_indices[i]}});
}
new_forward_index.push_back(cur);
}

PrimExpr new_forward_thread = forward_thread_;
for (size_t i = 0; i < InputShape().size(); ++i) {
new_forward_thread = Substitute(new_forward_thread,
{{InputPlaceholder(i), orig_indices[i]}});
}

Fragment reshaped(shape, new_forward_index, new_forward_thread,
ReplicateExtent(), std::nullopt);
if (thread_range_.defined()) {
reshaped = reshaped->BindThreadRange(thread_range_);
}
return reshaped;
}

Layout LayoutNode::Inverse() const {
auto inverse_result = InverseWithLevel();
return std::move(inverse_result.first);
}

PrimExpr infer_fragment_index(const Map<Var, Range> &input_iters,
const PrimExpr &forward_thread,
arith::Analyzer *analyzer) {
7 changes: 7 additions & 0 deletions src/layout/layout.h
@@ -41,6 +41,10 @@ class LayoutNode : public Object {
virtual Array<PrimExpr> Forward(const Array<PrimExpr> &vars) const;

virtual Layout Inverse() const;

virtual Layout Reshape(const Array<PrimExpr> &shape,
arith::Analyzer *analyzer) const;

virtual std::pair<Layout, arith::IterMapLevel> InverseWithLevel() const;

virtual std::string DebugOutput() const;
@@ -81,6 +85,9 @@ class FragmentNode : public LayoutNode {
Array<PrimExpr> GetForwardVars() const final;

Layout Inverse() const final;

Layout Reshape(const Array<PrimExpr> &shape, arith::Analyzer *analyzer) const;

std::pair<Layout, arith::IterMapLevel> InverseWithLevel() const final;

PrimExpr ThreadExtent() const;
51 changes: 49 additions & 2 deletions src/op/reduce.cc
@@ -14,17 +14,62 @@
#include "../op/parallel.h"
#include "../target/utils.h"
#include "../transform/loop_partition.h"
#include "region.h"
#include "tir/transforms/ir_utils.h"

namespace tvm {
namespace tl {

using namespace tir;

// Normalize an argument (BufferRegion/BufferLoad/tl.region)
// to BufferRegion so Reduce can uniformly consume regions.
static BufferRegion NormalizeToBufferRegion(const PrimExpr &arg,
const BufferMap &vmap) {
// Case 1: Already a BufferRegion
if (arg->IsInstance<BufferRegionNode>()) {
return Downcast<BufferRegion>(arg);
}

// Case 2: BufferLoad — convert indices to ranges (Ramp -> lanes, else
// extent=1)
if (const auto *load = arg.as<BufferLoadNode>()) {
Array<Range> ranges;
for (const PrimExpr &index : load->indices) {
if (const auto *ramp = index.as<RampNode>()) {
ICHECK(ramp->stride.as<IntImmNode>()) << "Ramp stride must be IntImm";
ICHECK_EQ(ramp->stride.as<IntImmNode>()->value, 1)
<< "Only stride-1 Ramp is supported in region conversion";
ICHECK(ramp->lanes.as<IntImmNode>())
<< "Scalable vector lanes not supported in region conversion";
ranges.push_back(Range::FromMinExtent(ramp->base, ramp->lanes));
} else {
ranges.push_back(Range::FromMinExtent(index, 1));
}
}
return BufferRegion(load->buffer, ranges);
}

// Case 3: Call nodes (only tl.region)
if (const auto *call = arg.as<CallNode>()) {
// tl.region(...) — reconstruct via RegionOp
if (call->op.same_as(RegionOp::Get())) {
RegionOp region(call->args, vmap);
return BufferRegion(region->GetBuffer(), region->GetRanges());
}
}

LOG(FATAL) << "Unsupported argument for BufferRegion in reduce: " << arg;
throw; // Unreachable
}

ReduceOp::ReduceOp(Array<PrimExpr> args, BufferMap vmap) {
ObjectPtr<ReduceOpNode> node = tvm::ffi::make_object<ReduceOpNode>();
node->src = vmap[GetVarFromAccessPtr(args[0])];
node->dst = vmap[GetVarFromAccessPtr(args[1])];
// Accept BufferRegion/BufferLoad/tl.region for src/dst
node->srcRegion_ = NormalizeToBufferRegion(args[0], vmap);
node->dstRegion_ = NormalizeToBufferRegion(args[1], vmap);
node->src = node->srcRegion_->buffer;
node->dst = node->dstRegion_->buffer;
std::string reduce_type = args[2].as<StringImm>().value()->value;
node->dim = args[3].as<IntImm>().value()->value;
node->type = ReduceType(reduce_type);
@@ -369,6 +414,7 @@ LayoutMap ReduceOpNode::InferLayout(const LayoutInferArgs &T,
InferLevel level) const {
if (level >= InferLevel::kStrict)
return {};

if (src.scope() == "local.fragment" && dst.scope() == "local.fragment" &&
T.layout_map.count(src)) {
auto src_layout = T.layout_map[src].as<Fragment>().value();
@@ -422,6 +468,7 @@ LayoutMap ReduceOpNode::InferLayout(const LayoutInferArgs &T,
Fragment(dst->shape, {}, thd, dest_buffer_rep_extent, std::nullopt)
->CondenseReplicateVar()
->BindThreadRange(T.thread_bounds);

if (!T.layout_map.count(dst))
return {{dst, dst_layout}};
else {
10 changes: 7 additions & 3 deletions src/op/reduce.h
@@ -82,9 +82,11 @@ class ReduceType : public ObjectRef {
class ReduceOpNode : public TileOperatorNode {
public:
tir::Buffer src, dst; ///< Source and destination buffers
int dim; ///< Dimension to reduce along
ReduceType type; ///< Type of reduction operation
bool clear; ///< Whether to clear destination before reduction
// Optional: keep the original regions used to construct this op
BufferRegion srcRegion_, dstRegion_;
int dim; ///< Dimension to reduce along
ReduceType type; ///< Type of reduction operation
bool clear; ///< Whether to clear destination before reduction

TVM_FFI_DECLARE_OBJECT_INFO_FINAL("tl.ReduceOp", ReduceOpNode,
TileOperatorNode);
@@ -94,6 +96,8 @@ class ReduceOpNode : public TileOperatorNode {
refl::ObjectDef<ReduceOpNode>()
.def_ro("src", &ReduceOpNode::src)
.def_ro("dst", &ReduceOpNode::dst)
.def_ro("srcRegion", &ReduceOpNode::srcRegion_)
.def_ro("dstRegion", &ReduceOpNode::dstRegion_)
.def_ro("dim", &ReduceOpNode::dim)
.def_ro("type", &ReduceOpNode::type)
.def_ro("clear", &ReduceOpNode::clear);