[Backend] Optimize membar insertion on hopper

lijinpei · lijinpei · commit 6ca49d1d4d2e · 2025-10-06T10:29:14.000+08:00
- `mbarrier.try_wait` has the same effect as a bar.
- Don't insert a bar between mbarrier arrive/expect-tx/etc. operations.
- Distribute `mbarrier.arrive`'s arrive-count across as many warps as
  possible.
- When all warps participate in `mbarrier.arrive`, don't insert a bar
  between it and a previous `wgmma.mma_async` or `stmatrix`.
diff --git a/include/triton/Analysis/Membar.h b/include/triton/Analysis/Membar.h
@@ -19,6 +19,7 @@ struct BlockInfo {
 
   IntervalMapT syncReadIntervals;
   IntervalMapT syncWriteIntervals;
+  IntervalMapT syncAtomicIntervals;
 
   BlockInfo() = default;
 
@@ -30,6 +31,9 @@ struct BlockInfo {
     for (auto &interval : other.syncWriteIntervals)
       syncWriteIntervals[interval.first].insert(interval.second.begin(),
                                                 interval.second.end());
+    for (auto &interval : other.syncAtomicIntervals)
+      syncAtomicIntervals[interval.first].insert(interval.second.begin(),
+                                                 interval.second.end());
     return *this;
   }
 
@@ -39,39 +43,67 @@ struct BlockInfo {
     err << "  Read Intervals:\n";
     for (auto &[interval, ops] : syncReadIntervals) {
       err << "    [" << interval.start() << ", " << interval.end() << "] ";
-      for (auto &op : ops)
-        err << op->getName() << " ";
+      for (auto &op : ops) {
+        op->dump();
+        err << "\n";
+      }
       err << "\n";
     }
     err << "  Write Intervals:\n";
     for (auto &[interval, ops] : syncWriteIntervals) {
       err << "    [" << interval.start() << ", " << interval.end() << "] ";
-      for (auto &op : ops)
-        err << op->getName() << " ";
+      for (auto &op : ops) {
+        op->dump();
+        err << "\n";
+      }
+      err << "\n";
+    }
+    err << "  Atomic Intervals:\n";
+    for (auto &[interval, ops] : syncAtomicIntervals) {
+      err << "    [" << interval.start() << ", " << interval.end() << "] ";
+      for (auto &op : ops) {
+        op->dump();
+        err << "\n";
+      }
       err << "\n";
     }
   }
 
   /// Returns true if intervals in two BlockInfo objects are intersected.
-  bool isIntersected(const BlockInfo &other, MembarFilterFn filter) const {
-    return /*RAW*/ isIntersected(syncWriteIntervals, other.syncReadIntervals,
-                                 filter) ||
-           /*WAR*/
-           isIntersected(syncReadIntervals, other.syncWriteIntervals, filter) ||
-           /*WAW*/
-           isIntersected(syncWriteIntervals, other.syncWriteIntervals, filter);
+  bool isIntersected(const BlockInfo &afterInfo, MembarFilterFn filter) const {
+    // * Atomic, Write, Read
+    // Atomic F, T, T
+    // Write  T, T, T
+    // Read   T, T, F
+    const auto &a0 = syncAtomicIntervals;
+    const auto &r0 = syncReadIntervals;
+    const auto &w0 = syncWriteIntervals;
+
+    // Note: `*this` comes before `afterInfo`.
+    const auto &a1 = afterInfo.syncAtomicIntervals;
+    const auto &r1 = afterInfo.syncReadIntervals;
+    const auto &w1 = afterInfo.syncWriteIntervals;
+
+    auto intersects = [&](const IntervalMapT &s0, const auto &...ss) {
+      return (... || isIntersected(s0, ss, filter));
+    };
+
+    return intersects(a0, w1, r1) || intersects(w0, a1, w1, r1) ||
+           intersects(r0, a1, w1);
   }
 
   /// Clears the intervals because a barrier is inserted.
   void sync() {
     syncReadIntervals.clear();
     syncWriteIntervals.clear();
+    syncAtomicIntervals.clear();
   }
 
   /// Compares two BlockInfo objects.
   bool operator==(const BlockInfo &other) const {
     return syncReadIntervals == other.syncReadIntervals &&
-           syncWriteIntervals == other.syncWriteIntervals;
+           syncWriteIntervals == other.syncWriteIntervals &&
+           syncAtomicIntervals == other.syncAtomicIntervals;
   }
 
   bool operator!=(const BlockInfo &other) const { return !(*this == other); }
diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h b/include/triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h
@@ -2,11 +2,23 @@
 #define TRITON_GPU_DIALECT_INTERFACES_H
 
 #include "mlir/IR/OpDefinition.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
 
 // clang-format off
 #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
 #include "triton/Dialect/TritonGPU/IR/OpInterfaces.h.inc"
 #include "triton/Dialect/TritonGPU/IR/AttrInterfaces.h.inc"
 // clang-format on
 
+namespace mlir::MemoryEffects {
+// An atomic read or write on mbarrier:
+// - atomic rmw:
+//   * mbarrier.arrive
+//   * mbarrier.expect_tx
+//   * cp.async.bulk.tensor
+// - atomic cas: mbarrier.try_wait
+// We don't need to insert a `__syncthreads()` between atomic effects, but we
+// would need one if they were write effects.
+struct MBarAtomic : public Effect::Base<MBarAtomic> {};
+} // namespace mlir::MemoryEffects
 #endif // TRITON_GPU_DIALECT_INTERFACES_H
diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUOpInterfaces.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUOpInterfaces.td
@@ -2,6 +2,7 @@
 #define TRITONGPU_OP_INTERFACES
 
 include "mlir/IR/OpBase.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
 
 def UpcastFpOpInterface : OpInterface<"UpcastFpOpInterface"> {
     let description = [{
@@ -26,4 +27,9 @@ def UpcastFpOpInterface : OpInterface<"UpcastFpOpInterface"> {
     ];
 }
 
+def GlobalMemory : Resource<"::mlir::triton::GlobalMemory">;
+def SharedMemory : Resource<"::mlir::triton::gpu::SharedMemory">;
+
+def MBarAtomic: MemoryEffect<"::mlir::MemoryEffects::MBarAtomic", SharedMemory, 0, PartialEffect>;
+
 #endif // TRITONGPU_OP_INTERFACES
diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
@@ -5,6 +5,7 @@ include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td"
 include "triton/Dialect/TritonGPU/IR/TritonGPUTypes.td"
 include "triton/Dialect/TritonGPU/IR/TritonGPUTypeInterfaces.td"
 include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td"
+include "triton/Dialect/TritonGPU/IR/TritonGPUOpInterfaces.td"
 include "mlir/Dialect/Arith/IR/ArithBase.td"
 include "triton/Dialect/Triton/IR/TritonTypes.td"
 include "triton/Dialect/Triton/IR/TritonAttrDefs.td"
@@ -19,9 +20,6 @@ include "mlir/Interfaces/ViewLikeInterface.td"
 //
 // Interfaces
 //
-def GlobalMemory : Resource<"::mlir::triton::GlobalMemory">;
-def SharedMemory : Resource<"::mlir::triton::gpu::SharedMemory">;
-
 class TTG_Op<string mnemonic, list<Trait> traits = []> :
     Op<TritonGPU_Dialect, mnemonic,
        !listconcat(traits, [VerifyTensorLayoutsTrait])> {
diff --git a/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td b/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td
@@ -30,6 +30,7 @@ include "triton/Dialect/Triton/IR/TritonAttrDefs.td"
 include "triton/Dialect/Triton/IR/TritonInterfaces.td"
 include "triton/Dialect/Triton/IR/TritonOpInterfaces.td"
 include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td"
+include "triton/Dialect/TritonGPU/IR/TritonGPUOpInterfaces.td"
 include "triton/Dialect/TritonGPU/IR/TritonGPUTypes.td"
 include "triton/Dialect/TritonGPU/IR/TritonGPUTypeInterfaces.td"
 include "mlir/IR/OpBase.td"
@@ -38,8 +39,6 @@ include "mlir/Interfaces/InferTypeOpInterface.td" // SameOperandsAndResultType
 include "mlir/Interfaces/DestinationStyleOpInterface.td"
 include "mlir/Interfaces/ViewLikeInterface.td"
 
-def GlobalMemory : Resource<"::mlir::triton::GlobalMemory">;
-def SharedMemory : Resource<"::mlir::triton::gpu::SharedMemory">;
 def TensorMemory : Resource<"::mlir::triton::nvidia_gpu::TensorMemory">;
 
 class TTNG_Op<string mnemonic, list<Trait> traits = []> :
@@ -170,7 +169,7 @@ def TTNG_BarrierExpectOp : TTNG_Op<"barrier_expect"> {
 
   let hasVerifier = 1;
   let arguments = (ins
-    Arg<TTG_MemDescType, "", [MemWrite<SharedMemory>]>:$alloc,
+    Arg<TTG_MemDescType, "", [MBarAtomic]>:$alloc,
     I32Attr:$size,
     I1:$pred
   );
@@ -198,7 +197,7 @@ def TTNG_WaitBarrierOp : TTNG_Op<"wait_barrier", [AttrSizedOperandSegments]> {
   }];
 
   let arguments = (ins
-    Arg<TTG_MemDescType, "", [MemRead<SharedMemory>, MemWrite<SharedMemory>]>:$alloc,
+    Arg<TTG_MemDescType, "", [MBarAtomic]>:$alloc,
     I32:$phase,
     Optional<I1>:$pred,
     Variadic<TTG_MemDescType>:$deps
@@ -245,7 +244,7 @@ def TTNG_ArriveBarrierOp : TTNG_Op<"arrive_barrier"> {
   }];
 
   let arguments = (ins
-    Arg<TTG_MemDescType, "", [MemRead<SharedMemory>, MemWrite<SharedMemory>]>:$alloc,
+    Arg<TTG_MemDescType, "", [MBarAtomic]>:$alloc,
     I32Attr:$count,
     Optional<I1>:$pred
   );
@@ -266,7 +265,7 @@ def TTNG_ArriveBarrierOp : TTNG_Op<"arrive_barrier"> {
 def TTNG_AsyncCopyMbarrierArriveOp : TTNG_Op<"async_copy_mbarrier_arrive"> {
   let summary = "arrive on mbarrier once all previously issued copies are completed";
   let arguments = (ins
-    Arg<TTG_MemDescType, "", [MemWrite<SharedMemory>]>:$barrier,
+    Arg<TTG_MemDescType, "", [MBarAtomic]>:$barrier,
     UnitAttr:$noIncrement
   );
   let assemblyFormat = "$barrier attr-dict `:` qualified(type($barrier))";
@@ -288,7 +287,7 @@ def TTNG_AsyncTMACopyGlobalToLocalOp : TTNG_Op<"async_tma_copy_global_to_local">
   let arguments = (ins
     Arg<TT_TensorDescType, "", [MemRead<GlobalMemory>]>:$desc,
     Variadic<I32>:$coord,
-    Arg<TTG_MemDescType, "", [MemWrite<SharedMemory>]>:$barrier,
+    Arg<TTG_MemDescType, "", [MBarAtomic]>:$barrier,
     Arg<TTG_MemDescType, "", [MemWrite<SharedMemory>]>:$result,
     I1:$pred,
     DefaultValuedAttr<TT_CacheModifierAttr, "triton::CacheModifier::NONE">:$cache,
@@ -362,7 +361,7 @@ def TTNG_AsyncTMAGatherOp : TTNG_Op<"async_tma_gather"> {
     Arg<TT_TensorDescType, "", [MemRead<GlobalMemory>]>:$desc,
     RankedTensorOf<[I32]>:$x_offsets,
     I32:$y_offset,
-    Arg<TTG_MemDescType, "", [MemWrite<SharedMemory>]>:$barrier,
+    Arg<TTG_MemDescType, "", [MBarAtomic]>:$barrier,
     Arg<TTG_MemDescType, "", [MemWrite<SharedMemory>]>:$result,
     I1:$pred
   );
diff --git a/lib/Analysis/Membar.cpp b/lib/Analysis/Membar.cpp
@@ -1,6 +1,7 @@
 #include "triton/Analysis/Membar.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
+#include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h"
 #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
 
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
@@ -165,7 +166,8 @@ void MembarAnalysis::insertBarrier(Operation *op, OpBuilder *builder) {
 void MembarAnalysis::update(Operation *op, BlockInfo *blockInfo,
                             FuncBlockInfoMapT *funcBlockInfoMap,
                             OpBuilder *builder) {
-  if (isa<gpu::BarrierOp, triton::gpu::LocalBarrierOp>(op)) {
+  if (isa<gpu::BarrierOp, triton::gpu::LocalBarrierOp,
+          triton::nvidia_gpu::WaitBarrierOp>(op)) {
     // If the current op is a barrier, we sync previous reads and writes
     blockInfo->sync();
     return;
@@ -210,6 +212,12 @@ void MembarAnalysis::update(Operation *op, BlockInfo *blockInfo,
                     .syncReadIntervals[allocation->getAllocatedInterval(
                         bufferId)]
                     .insert(op);
+              else if (isa<MemoryEffects::MBarAtomic>(
+                           effectInstance.getEffect()))
+                curBlockInfo
+                    .syncAtomicIntervals[allocation->getAllocatedInterval(
+                        bufferId)]
+                    .insert(op);
             }
           }
         }
@@ -244,7 +252,8 @@ void MembarAnalysis::update(Operation *op, BlockInfo *blockInfo,
     }
 
     if (!curBlockInfo.syncReadIntervals.empty() ||
-        !curBlockInfo.syncWriteIntervals.empty()) {
+        !curBlockInfo.syncWriteIntervals.empty() ||
+        !curBlockInfo.syncAtomicIntervals.empty()) {
       llvm::report_fatal_error(
           "scratch buffer operations should not have any shared memory "
           "dependencies");
diff --git a/third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUOps.td b/third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUOps.td
@@ -46,12 +46,6 @@ include "TritonAMDGPUAttrDefs.td"
 class TT_AMDGPU_Op<string mnemonic, list<Trait> traits = []> :
     Op<TritonAMDGPU_Dialect, mnemonic, !listconcat(traits, [])>;
 
-//
-// Interfaces
-//
-def GlobalMemory : Resource<"::mlir::triton::GlobalMemory">;
-def SharedMemory : Resource<"::mlir::triton::gpu::SharedMemory">;
-
 //===----------------------------------------------------------------------===//
 // ExtractSliceOp
 //===----------------------------------------------------------------------===//
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/BarrierOpToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/BarrierOpToLLVM.cpp
@@ -233,24 +233,38 @@ struct ArriveBarrierOpConversion
   LogicalResult
   matchAndRewrite(triton::nvidia_gpu::ArriveBarrierOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    // TODO: Add phase result as needed.
     std::stringstream ptxAsm;
-    ptxAsm << "@$0 mbarrier.arrive.shared::cta.b64 _, [$1]";
-    if (op.getCount() > 1) {
-      ptxAsm << ", " << op.getCount();
-    }
-    ptxAsm << ";";
+    ptxAsm << "@$0 mbarrier.arrive.shared::cta.b64 _, [$1], $2;";
 
     TritonLLVMOpBuilder b(op.getLoc(), rewriter);
-    Value id = getThreadId(rewriter, op.getLoc());
-    Value pred = b.icmp_eq(id, b.i32_val(0));
+    Value pred = LLVM::NVIDIA::createElectPredicate(op.getLoc(), rewriter);
     if (op.getPred())
       pred = b.and_(pred, adaptor.getPred());
 
+    // Distribute arrive-count equally among participating warps.
+    int count = op.getCount();
+    int numWarps = triton::gpu::lookupNumWarps(op);
+    int countPerWarp = count / numWarps;
+    int remainderCount = count % numWarps;
+    auto [_, warpId] = getLaneAndWarpId(rewriter, op.getLoc());
+    Value remPred = b.icmp_ult(warpId, b.i32_val(remainderCount));
+    if (countPerWarp < 1) {
+      pred = b.and_(pred, remPred);
+    }
+    Value countVal;
+    if (remainderCount) {
+      countVal = b.select(remPred, b.i32_val(countPerWarp + 1),
+                          b.i32_val(countPerWarp));
+    } else {
+      countVal = b.i32_val(countPerWarp);
+    }
+
     PTXBuilder ptxBuilder;
     SmallVector<PTXBuilder::Operand *, 2> operands = {
         ptxBuilder.newOperand(pred, "b"),
-        ptxBuilder.newOperand(adaptor.getAlloc(), "r")};
+        ptxBuilder.newOperand(adaptor.getAlloc(), "r"),
+        ptxBuilder.newOperand(countVal, "r"),
+    };
 
     auto arriveOp = *ptxBuilder.create<>(ptxAsm.str());
     arriveOp(operands, /*onlyAttachMLIRArgs=*/true);
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TritonGPUToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TritonGPUToLLVM.cpp
@@ -68,6 +68,19 @@ class TritonLLVMConversionTarget : public ConversionTarget {
   }
 };
 
+bool membarFilter(Operation *beforeOp, Operation *afterOp) {
+  if (isa<triton::nvidia_gpu::WarpGroupDotOp, triton::gpu::LocalStoreOp>(
+          beforeOp)) {
+    if (auto mbarArriveOp =
+            dyn_cast<triton::nvidia_gpu::ArriveBarrierOp>(afterOp)) {
+      auto numWarps = triton::gpu::lookupNumWarps(afterOp);
+      auto numArrive = mbarArriveOp.getCount();
+      return numArrive >= numWarps;
+    }
+  }
+  return false;
+}
+
 struct ConvertTritonGPUToLLVM
     : public triton::impl::ConvertTritonGPUToLLVMBase<ConvertTritonGPUToLLVM> {
   using ConvertTritonGPUToLLVMBase::ConvertTritonGPUToLLVMBase;
@@ -86,7 +99,7 @@ struct ConvertTritonGPUToLLVM
     ModuleAllocation allocation(
         mod, mlir::triton::nvidia_gpu::getNvidiaAllocationAnalysisScratchSizeFn(
                  targetInfo));
-    ModuleMembarAnalysis membarPass(&allocation);
+    ModuleMembarAnalysis membarPass(&allocation, membarFilter);
     membarPass.run();
 
     mlir::LowerToLLVMOptions option(context);