[Blackwell] Refactor/slightly generalize warp specialization #6597

Merged · 61 commits · May 1, 2025
Commits (61)
7fc06c0
start introducing tokens
Mogball Apr 15, 2025
4023111
hoist tmem alloc
Mogball Apr 16, 2025
56e1c8c
cleanup
Mogball Apr 16, 2025
225f241
add test for sinking into conditional
Mogball Apr 16, 2025
94d991c
fix tests and some bugs
Mogball Apr 16, 2025
9b742a0
fix repl token
Mogball Apr 16, 2025
720a700
fix aws test
Mogball Apr 16, 2025
107daf0
fix test
Mogball Apr 16, 2025
19e37d4
fixing tests, remove TMEM tokens
Mogball Apr 16, 2025
c09d3c5
separate pass for removing TMEM tokens
Mogball Apr 17, 2025
d2acd19
fix tests
Mogball Apr 17, 2025
8521ccf
schedule loops
Mogball Apr 17, 2025
db65c51
bench
Mogball Apr 17, 2025
4a9a5ed
fix compile only test
Mogball Apr 17, 2025
c7cd1c4
delete dead code
Mogball Apr 17, 2025
ad6d7de
unused forward decl
Mogball Apr 17, 2025
303df26
Merge remote-tracking branch 'origin/main' into mogball/tmem_toks
Mogball Apr 17, 2025
7bc72fc
[Blackwell] Support DescriptorLoadOp when deciding to use shared memo…
csullivan Apr 14, 2025
7e608e3
[Bench][Blackwell] Support optional scale TMAs in warp specialization…
csullivan Apr 18, 2025
0dffe75
hoisttmemalloc checks that tokens are present
Mogball Apr 21, 2025
8aa9165
add doc about tokens to op definitions
Mogball Apr 21, 2025
1349758
Merge branch 'mogball/tmem_toks' into mogball/fmha
Mogball Apr 21, 2025
54884c4
Merge remote-tracking branch 'origin/csullivan/support_block_scales_i…
Mogball Apr 21, 2025
7767409
simplify util
Mogball Apr 21, 2025
9d4ebe2
refactor LoadMMASpecialization to support any number of loads
Mogball Apr 21, 2025
9e018e5
fix handling cycle in user partition
Mogball Apr 21, 2025
4a72ab8
refactor loads into loadgroups
Mogball Apr 22, 2025
695eb2a
Merge branch 'main' into mogball/tmem_toks
Mogball Apr 22, 2025
d63cb82
fix
Mogball Apr 22, 2025
bffcb5b
cleanup packLL utilities
Mogball Apr 23, 2025
e8f28b4
Merge branch 'main' into mogball/tmem_toks
Mogball Apr 23, 2025
d6a78f4
WIP refactoring...
Mogball Apr 24, 2025
63b7da0
Merge remote-tracking branch 'origin/main' into mogball/fmha
Mogball Apr 24, 2025
9f1fc29
Revert "[Blackwell] Support DescriptorLoadOp when deciding to use sha…
Mogball Apr 24, 2025
b9b73f9
Revert "[Bench][Blackwell] Support optional scale TMAs in warp specia…
Mogball Apr 24, 2025
fa8b255
Merge branch 'main' into mogball/tmem_toks
Mogball Apr 24, 2025
c5f8cb4
fix conflict
Mogball Apr 24, 2025
8342d7a
Merge branch 'mogball/tmem_toks' into mogball/fmha
Mogball Apr 24, 2025
2f74f02
loads work
Mogball Apr 24, 2025
6e5b526
mmas are a pain
Mogball Apr 25, 2025
2af0311
Merge remote-tracking branch 'origin/main' into mogball/tmem_toks
Mogball Apr 25, 2025
5860abe
Merge branch 'mogball/tmem_toks' into mogball/fmha
Mogball Apr 25, 2025
1f507ac
done but does it work?
Mogball Apr 25, 2025
dda423b
it deadlocks
Mogball Apr 25, 2025
d965b3b
works but ends too early
Mogball Apr 25, 2025
b4cb1af
fix regular matmul
Mogball Apr 25, 2025
1140e1c
fix
Mogball Apr 26, 2025
4eff58e
fixed
Mogball Apr 26, 2025
38072a6
forgot to handle P
Mogball Apr 26, 2025
8358c0d
fix optzn
Mogball Apr 26, 2025
50c3b55
dep dialect
Mogball Apr 28, 2025
b4a8612
savepoint: OAI benchmarks look good
Mogball Apr 28, 2025
2b90525
rename op
Mogball Apr 28, 2025
a912113
put scales into smem
Mogball Apr 28, 2025
698e94f
put local load in user partition
Mogball Apr 28, 2025
7ffbd86
add another test
Mogball Apr 28, 2025
ee8eda3
add another test
Mogball Apr 28, 2025
93e42ba
Merge remote-tracking branch 'origin/main' into HEAD
Mogball Apr 29, 2025
4fba3f6
refactor pipelineMMA
Mogball Apr 29, 2025
1615916
handle peeled wait
Mogball Apr 29, 2025
d8887ac
Merge branch 'main' into mogball/fmha
Mogball Apr 30, 2025
87 changes: 7 additions & 80 deletions include/triton/Conversion/TritonGPUToLLVM/Utility.h
@@ -718,89 +718,16 @@ void storeDistributedToShared(
RewriterBase &rewriter, const TargetInfoBase &target,
std::pair<size_t, Type> *const llvmOpCount = nullptr);

inline SmallVector<Value> unpackLLElements(Location loc, Value llvmStruct,
RewriterBase &rewriter) {
assert(bool(llvmStruct) && "can not unpack null values");
if (llvmStruct.getType().isIntOrIndexOrFloat() ||
isa<triton::PointerType>(llvmStruct.getType()) ||
isa<LLVM::LLVMPointerType>(llvmStruct.getType()))
return {llvmStruct};
ArrayRef<Type> types =
cast<LLVM::LLVMStructType>(llvmStruct.getType()).getBody();
SmallVector<Value> results(types.size());
auto b = TritonLLVMOpBuilder(loc, rewriter);
for (unsigned i = 0; i < types.size(); ++i) {
Type type = types[i];
results[i] = b.extract_val(type, llvmStruct, i);
}
return results;
}

inline Value packLLElements(Location loc,
const LLVMTypeConverter *typeConverter,
ValueRange resultVals, RewriterBase &rewriter,
Type type) {
auto structType =
dyn_cast<LLVM::LLVMStructType>(typeConverter->convertType(type));
if (!structType) {
assert(resultVals.size() == 1);
return *resultVals.begin();
}

auto elementTypes = structType.getBody();
if (elementTypes.size() != resultVals.size()) {
emitError(loc) << " size mismatch when packing elements for LLVM struct"
<< " expected " << elementTypes.size() << " but got "
<< resultVals.size();
}
Value llvmStruct = rewriter.create<LLVM::UndefOp>(loc, structType);
auto b = TritonLLVMOpBuilder(loc, rewriter);
for (const auto &v : llvm::enumerate(resultVals)) {
if (!v.value()) {
emitError(loc)
<< "cannot insert null values into struct, but tried to insert"
<< v.value();
}
if (v.value().getType() != elementTypes[v.index()]) {
LDBG("type " << type << " structType " << structType);
LDBG("value " << v.value());
emitError(loc) << "invalid element type in packLLElements. Expected "
<< elementTypes[v.index()] << " but got "
<< v.value().getType();
}
llvmStruct = b.insert_val(structType, llvmStruct, v.value(), v.index());
}
return llvmStruct;
}
SmallVector<Value> unpackLLElements(Location loc, Value llvmStruct,
RewriterBase &rewriter);

inline SmallVector<Value> unpackLLVector(Location loc, Value llvmVec,
RewriterBase &rewriter) {
assert(bool(llvmVec) && "cannot unpack null value");
if (llvmVec.getType().isIntOrIndexOrFloat() ||
isa<triton::PointerType>(llvmVec.getType()) ||
isa<LLVM::LLVMPointerType>(llvmVec.getType()))
return {llvmVec};
Value packLLElements(Location loc, const LLVMTypeConverter *typeConverter,
ValueRange resultVals, RewriterBase &rewriter, Type type);

auto b = TritonLLVMOpBuilder(loc, rewriter);
SmallVector<Value> results;
for (int i = 0; i < cast<VectorType>(llvmVec.getType()).getNumElements();
i++) {
results.push_back(b.extract_element(llvmVec, b.i32_val(i)));
}
return results;
}
SmallVector<Value> unpackLLVector(Location loc, Value llvmVec,
RewriterBase &rewriter);

inline Value packLLVector(Location loc, ValueRange vals,
RewriterBase &rewriter) {
assert(vals.size() > 0);
auto vecType = vec_ty(vals[0].getType(), vals.size());
auto b = TritonLLVMOpBuilder(loc, rewriter);
Value vec = b.undef(vecType);
for (int i = 0; i < vals.size(); i++) {
vec = b.insert_element(vec, vals[i], b.i32_val(i));
}
return vec;
}
Value packLLVector(Location loc, ValueRange vals, RewriterBase &rewriter);

inline bool
isSimpleSharedMemoryAccess(ArrayRef<int64_t> shape,
6 changes: 6 additions & 0 deletions include/triton/Dialect/TritonGPU/Transforms/Partition.h
@@ -45,6 +45,7 @@ class WarpSchedule {
ArrayRef<Operation *> getOps() const { return ops; }

void insert(Operation *op) { ops.push_back(op); }
void remove(Operation *op) { ops.erase(llvm::find(ops, op)); }

private:
void setIndex(int idx) { this->idx = idx; }
@@ -62,6 +63,8 @@ class WarpSchedule {
Partition *addPartition(unsigned stage);
// Give each partition a new index and order. The indices must be unique.
void reorderPartitions(ArrayRef<unsigned> order);
// Update the op to partition mapping.
void updatePartitions();

// Get the partition the op belongs to.
Partition *getPartition(Operation *op);
@@ -115,6 +118,9 @@ class WarpSchedule {
scf::ForOp loop, const Partition *partition,
function_ref<void(OpResult, OpOperand &, unsigned)> callback) const;

// Debug dump the schedule.
LLVM_DUMP_METHOD void dump() const;

private:
// Partitions are numbered [0, N).
SmallVector<std::unique_ptr<Partition>> partitions;
3 changes: 2 additions & 1 deletion include/triton/Dialect/TritonGPU/Transforms/Passes.td
@@ -174,7 +174,8 @@ def TritonGPULoadMMASpecialization : Pass<"tritongpu-load-mma-specialization", "
and async MMAs into separate partitions.
}];

let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect"];
let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
"mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect"];

let options = [
Option<"numStages", "num-stages", "int32_t", /*default*/"3",
@@ -100,7 +100,8 @@ DenseMap<Operation *, int> deserializeLatencies(Operation *op);
Value createScalarAlloc(ImplicitLocOpBuilder &rewriter, Type type,
unsigned numBuffers);
// Create an allocation and init the mbarriers.
Value createBarrierAlloc(scf::ForOp forOp, int numBarriers);
Value createBarrierAlloc(scf::ForOp forOp, int numBarriers,
int arriveCount = 1);
Reviewer (Collaborator): Is this part of the refactoring, or is it addressing a separate issue?

Author (Collaborator): This is part of the refactor. Load groups can have multiple consumers.

// Create an allocation that can hold distance number of tensor shapes.
Value createAlloc(scf::ForOp forOp, RankedTensorType ty, Location loc,
gpu::SharedEncodingTrait sharedEnc, unsigned distance);
80 changes: 80 additions & 0 deletions lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -553,6 +553,86 @@ void storeDistributedToShared(triton::gpu::MemDescType dstTy,
llvm::report_fatal_error("Failed to emit transfer from register to shared");
}

SmallVector<Value> unpackLLElements(Location loc, Value llvmStruct,
RewriterBase &rewriter) {
assert(bool(llvmStruct) && "can not unpack null values");
if (llvmStruct.getType().isIntOrIndexOrFloat() ||
isa<triton::PointerType>(llvmStruct.getType()) ||
isa<LLVM::LLVMPointerType>(llvmStruct.getType()))
return {llvmStruct};
ArrayRef<Type> types =
cast<LLVM::LLVMStructType>(llvmStruct.getType()).getBody();
SmallVector<Value> results(types.size());
auto b = TritonLLVMOpBuilder(loc, rewriter);
for (unsigned i = 0; i < types.size(); ++i) {
Type type = types[i];
results[i] = b.extract_val(type, llvmStruct, i);
}
return results;
}

Value packLLElements(Location loc, const LLVMTypeConverter *typeConverter,
ValueRange resultVals, RewriterBase &rewriter, Type type) {
auto structType =
dyn_cast<LLVM::LLVMStructType>(typeConverter->convertType(type));
if (!structType) {
assert(resultVals.size() == 1);
return *resultVals.begin();
}

auto elementTypes = structType.getBody();
if (elementTypes.size() != resultVals.size()) {
emitError(loc) << " size mismatch when packing elements for LLVM struct"
<< " expected " << elementTypes.size() << " but got "
<< resultVals.size();
llvm::report_fatal_error(
"size mismatch when packing elements for LLVM struct");
}
Value llvmStruct = rewriter.create<LLVM::UndefOp>(loc, structType);
auto b = TritonLLVMOpBuilder(loc, rewriter);
for (auto [i, value] : llvm::enumerate(resultVals)) {
assert(value && "unexpected null value");
if (value.getType() != elementTypes[i]) {
LDBG("type " << type << " structType " << structType);
LDBG("value " << value);
emitError(loc) << "invalid element type in packLLElements. Expected "
<< elementTypes[i] << " but got " << value.getType();
llvm::report_fatal_error(
"element type mismatch when packing elements for LLVM struct");
}
llvmStruct = b.insert_val(structType, llvmStruct, value, i);
}
return llvmStruct;
}

SmallVector<Value> unpackLLVector(Location loc, Value llvmVec,
RewriterBase &rewriter) {
assert(bool(llvmVec) && "cannot unpack null value");
if (llvmVec.getType().isIntOrIndexOrFloat() ||
isa<triton::PointerType>(llvmVec.getType()) ||
isa<LLVM::LLVMPointerType>(llvmVec.getType()))
return {llvmVec};

auto b = TritonLLVMOpBuilder(loc, rewriter);
SmallVector<Value> results;
for (int i = 0; i < cast<VectorType>(llvmVec.getType()).getNumElements();
i++) {
results.push_back(b.extract_element(llvmVec, b.i32_val(i)));
}
return results;
}

Value packLLVector(Location loc, ValueRange vals, RewriterBase &rewriter) {
assert(vals.size() > 0);
auto vecType = vec_ty(vals[0].getType(), vals.size());
auto b = TritonLLVMOpBuilder(loc, rewriter);
Value vec = b.undef(vecType);
for (int i = 0; i < vals.size(); i++) {
vec = b.insert_element(vec, vals[i], b.i32_val(i));
}
return vec;
}

SmallVector<SmallVector<unsigned>> emitOffsetForLayout(Attribute layout,
RankedTensorType type) {
MLIRContext *ctx = layout.getContext();
@@ -288,6 +288,7 @@ bool mlir::triton::getDisallowAccMultiBuffer(scf::ForOp forOp) {
std::pair<OpResult, int64_t>
mlir::triton::getDefinitionAndDistance(scf::ForOp forOp, Value value) {
int64_t distance = 0;
DenseSet<Value> seen;
while (auto arg = dyn_cast<BlockArgument>(value)) {
// Ignore implicit captures.
if (arg.getOwner() != forOp.getBody())
@@ -297,6 +298,8 @@ mlir::triton::getDefinitionAndDistance(scf::ForOp forOp, Value value) {
return {nullptr, 0};
++distance;
value = forOp.getYieldedValues()[arg.getArgNumber() - 1];
if (!seen.insert(value).second)
return {nullptr, 0};
}
Reviewer (Collaborator): This also doesn't feel like refactoring :]

Author (Collaborator): Some of the refactoring exposed a bug :P

return {cast<OpResult>(value), distance};
}
@@ -358,14 +361,15 @@ Value mlir::triton::createScalarAlloc(ImplicitLocOpBuilder &rewriter, Type type,
}

// Create an allocation and init the mbarriers.
Value mlir::triton::createBarrierAlloc(scf::ForOp forOp, int numBarriers) {
Value mlir::triton::createBarrierAlloc(scf::ForOp forOp, int numBarriers,
int arriveCount) {
ImplicitLocOpBuilder rewriter(forOp.getLoc(), forOp);

Value barrierAlloc =
createScalarAlloc(rewriter, rewriter.getI64Type(), numBarriers);
for (unsigned i = 0; i < numBarriers; i++) {
Value barrierView = createSingleBufferView(rewriter, barrierAlloc, i);
rewriter.create<ttng::InitBarrierOp>(barrierView, 1);
rewriter.create<ttng::InitBarrierOp>(barrierView, arriveCount);
}
// Invalidate and deallocate the barriers.
rewriter.setInsertionPointAfter(forOp);
18 changes: 10 additions & 8 deletions lib/Dialect/TritonGPU/Transforms/Pipeliner/Schedule.cpp
@@ -92,17 +92,19 @@ tt::CoarseSchedule::getOpsInOrder(scf::ForOp forOp) {
SmallVector<SmallVector<std::tuple<Operation *, int, Cluster>>, 8>
orderClusters(clusters.size());
for (auto &op : forOp.getBody()->without_terminator()) {
if (opToStageAndCluster.count(&op) == 0) {
auto it = opToStageAndCluster.find(&op);
if (it == opToStageAndCluster.end()) {
continue;
}
assert(opToStageAndCluster[&op].first < numStages &&
"Op with invalid stage!");
int clusterId = *opToStageAndCluster[&op].second;
assert(clusterId == std::distance(clusters.begin(),
opToStageAndCluster[&op].second) &&
auto [stage, cluster] = it->second;
if (cluster == Cluster{}) {
continue;
}
assert(stage < numStages && "Op with invalid stage!");
int clusterId = *cluster;
assert(clusterId == std::distance(clusters.begin(), cluster) &&
"Cluster ID mismatch!");
orderClusters[clusterId].push_back(make_tuple(
&op, opToStageAndCluster[&op].first, opToStageAndCluster[&op].second));
orderClusters[clusterId].push_back(make_tuple(&op, stage, cluster));
}
SmallVector<std::tuple<Operation *, int, Cluster>> opsInOrder;
for (int i = 0; i < orderClusters.size(); i++) {