Skip to content

[mlir][AMDGPU] Improve DPP implementation of subgroup reduction #136804

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 28 commits into
base: main
Choose a base branch
from

Conversation

Muzammiluddin-Syed-ECE
Copy link
Contributor

Following up on PR 133204, we add further improvements to the DPP implementation of subgroup reduce.

ops.

Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
@Muzammiluddin-Syed-ECE Muzammiluddin-Syed-ECE marked this pull request as draft April 23, 2025 03:52
Copy link

github-actions bot commented Apr 23, 2025

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:
git-clang-format --diff HEAD~1 HEAD --extensions h,cpp -- mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp mlir/include/mlir/Dialect/GPU/Transforms/Passes.h mlir/include/mlir/Dialect/GPU/Utils/GPUUtils.h mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp mlir/lib/Dialect/GPU/Utils/Utils.cpp mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
View the diff from clang-format here.
diff --git a/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h b/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h
index f766dab8c..3c42b1f4e 100644
--- a/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h
+++ b/mlir/include/mlir/Dialect/GPU/Utils/ReductionUtils.h
@@ -9,9 +9,9 @@
 #ifndef MLIR_DIALECT_GPU_TRANSFORMS_REDUCTIONUTILS_H_
 #define MLIR_DIALECT_GPU_TRANSFORMS_REDUCTIONUTILS_H_
 
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
 #include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
@@ -27,7 +27,7 @@ struct ClusterInfo {
 };
 
 FailureOr<ClusterInfo> getAndValidateClusterInfo(gpu::SubgroupReduceOp op,
-  unsigned subgroupSize);
+                                                 unsigned subgroupSize);
 
 FailureOr<Value>
 createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index 57af63cbe..7f5e38b79 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -161,7 +161,8 @@ struct ScalarizeSingleElementReduce final
 
 //   std::optional<uint32_t> clusterSize = op.getClusterSize();
 //   assert(!clusterSize ||
-//          llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught this.
+//          llvm::isPowerOf2_32(*clusterSize)); // Verifier should've caught
+//          this.
 //   if (clusterSize && *clusterSize > subgroupSize)
 //     return op.emitOpError()
 //            << "cluster size " << *clusterSize
@@ -169,8 +170,8 @@ struct ScalarizeSingleElementReduce final
 //   unsigned effectiveClusterSize = clusterSize.value_or(subgroupSize);
 
 //   auto clusterStride = op.getClusterStride();
-//   assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught this.
-//   if (clusterStride >= subgroupSize)
+//   assert(llvm::isPowerOf2_32(clusterStride)); // Verifier should've caught
+//   this. if (clusterStride >= subgroupSize)
 //     return op.emitOpError()
 //            << "cluster stride " << clusterStride
 //            << " is not less than subgroup size " << subgroupSize;
@@ -369,7 +370,8 @@ private:
 };
 
 // FailureOr<Value>
-// createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
+// createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp
+// op,
 //                            Value input, gpu::AllReduceOperation mode,
 //                            const ClusterInfo &ci, amdgpu::Chipset chipset) {
 //   Location loc = op.getLoc();
@@ -382,18 +384,22 @@ private:
 //     // Perform reduction between all lanes N <-> N+1.
 //     dpp = rewriter.create<amdgpu::DPPOp>(
 //         loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
-//         rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks, boundCtrl);
+//         rewriter.getI32ArrayAttr({1, 0, 3, 2}), allRows, allBanks,
+//         boundCtrl);
 //     res = vector::makeArithReduction(rewriter, loc,
-//                                      gpu::convertReductionKind(mode), res, dpp);
+//                                      gpu::convertReductionKind(mode), res,
+//                                      dpp);
 //   }
 
 //   if (ci.clusterSize >= 4) {
 //     // Perform reduction between all lanes N <-> N+2.
 //     dpp = rewriter.create<amdgpu::DPPOp>(
 //         loc, res.getType(), res, res, amdgpu::DPPPerm::quad_perm,
-//         rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks, boundCtrl);
+//         rewriter.getI32ArrayAttr({2, 3, 0, 1}), allRows, allBanks,
+//         boundCtrl);
 //     res = vector::makeArithReduction(rewriter, loc,
-//                                      gpu::convertReductionKind(mode), res, dpp);
+//                                      gpu::convertReductionKind(mode), res,
+//                                      dpp);
 //   }
 //   if (ci.clusterSize >= 8) {
 //     // Perform reduction between all lanes N <-> 7-N,
@@ -402,16 +408,18 @@ private:
 //         loc, res.getType(), res, res, amdgpu::DPPPerm::row_half_mirror,
 //         rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
 //     res = vector::makeArithReduction(rewriter, loc,
-//                                      gpu::convertReductionKind(mode), res, dpp);
+//                                      gpu::convertReductionKind(mode), res,
+//                                      dpp);
 //   }
 //   if (ci.clusterSize >= 16) {
 //     // Perform reduction between all lanes N <-> 15-N,
-//     // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <-> lane[8].
-//     dpp = rewriter.create<amdgpu::DPPOp>(
+//     // e.g lane[0] <-> lane[15], lane[1] <-> lane[14]..., lane[7] <->
+//     lane[8]. dpp = rewriter.create<amdgpu::DPPOp>(
 //         loc, res.getType(), res, res, amdgpu::DPPPerm::row_mirror,
 //         rewriter.getUnitAttr(), allRows, allBanks, boundCtrl);
 //     res = vector::makeArithReduction(rewriter, loc,
-//                                      gpu::convertReductionKind(mode), res, dpp);
+//                                      gpu::convertReductionKind(mode), res,
+//                                      dpp);
 //   }
 //   if (ci.clusterSize >= 32) {
 //     if (chipset.majorVersion <= 9) {
@@ -427,7 +435,8 @@ private:
 //       // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2).
 //       Value uint32Max = rewriter.create<arith::ConstantOp>(
 //           loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(-1));
-//       dpp = rewriter.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res, res,
+//       dpp = rewriter.create<ROCDL::PermlaneX16Op>(loc, res.getType(), res,
+//       res,
 //                                                   uint32Max, uint32Max,
 //                                                   /*fi=*/true,
 //                                                   /*bound_ctrl=*/false);
@@ -437,7 +446,8 @@ private:
 //         Value lane0 = rewriter.create<arith::ConstantOp>(
 //             loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
 //         res =
-//             rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+//             rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res,
+//             lane0);
 //       }
 //     } else {
 //       return rewriter.notifyMatchFailure(
@@ -462,15 +472,17 @@ private:
 //           loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0));
 //       Value lane32 = rewriter.create<arith::ConstantOp>(
 //           loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(32));
-//       dpp = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane32);
-//       res = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res, lane0);
+//       dpp = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(), res,
+//       lane32); res = rewriter.create<ROCDL::ReadlaneOp>(loc, res.getType(),
+//       res, lane0);
 //     } else {
 //       return rewriter.notifyMatchFailure(
 //           op, "Subgroup reduce lowering to DPP not currently supported for "
 //               "this device.");
 //     }
 //     res = vector::makeArithReduction(rewriter, loc,
-//                                      gpu::convertReductionKind(mode), res, dpp);
+//                                      gpu::convertReductionKind(mode), res,
+//                                      dpp);
 //   }
 //   assert(res.getType() == input.getType());
 //   return res;
@@ -484,8 +496,9 @@ struct ScalarSubgroupReduceToDPP final
   ScalarSubgroupReduceToDPP(MLIRContext *ctx, unsigned subgroupSize,
                             unsigned shuffleBitwidth, bool matchClustered,
                             amdgpu::Chipset chipset, PatternBenefit benefit)
-      : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize), shuffleBitwidth(shuffleBitwidth),
-        matchClustered(matchClustered), chipset(chipset) {}
+      : OpRewritePattern(ctx, benefit), subgroupSize(subgroupSize),
+        shuffleBitwidth(shuffleBitwidth), matchClustered(matchClustered),
+        chipset(chipset) {}
 
   LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op,
                                 PatternRewriter &rewriter) const override {
@@ -540,8 +553,9 @@ struct ScalarSubgroupReduceToDPP final
       return rewriter.create<arith::BitcastOp>(loc, valueTy, asInt);
     };
 
-    FailureOr<Value> dpp = createSubgroupDPPReduction(
-        rewriter, op, op.getValue(), op.getOp(), *ci, chipset, packFn, unpackFn);
+    FailureOr<Value> dpp =
+        createSubgroupDPPReduction(rewriter, op, op.getValue(), op.getOp(), *ci,
+                                   chipset, packFn, unpackFn);
     if (failed(dpp))
       return failure();
 
diff --git a/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
index 2f50a1ec8..a0ef7f329 100644
--- a/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
+++ b/mlir/lib/Dialect/GPU/Utils/ReductionUtils.cpp
@@ -10,11 +10,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
-#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
 #include "mlir/Dialect/GPU/Utils/ReductionUtils.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/Value.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
@@ -24,7 +24,7 @@
 using namespace mlir;
 
 FailureOr<ClusterInfo> mlir::getAndValidateClusterInfo(gpu::SubgroupReduceOp op,
-                                                 unsigned subgroupSize) {
+                                                       unsigned subgroupSize) {
   assert(llvm::isPowerOf2_32(subgroupSize));
 
   std::optional<uint32_t> clusterSize = op.getClusterSize();
@@ -51,7 +51,7 @@ FailureOr<Value> mlir::createSubgroupDPPReduction(
     gpu::AllReduceOperation mode, const ClusterInfo &ci,
     amdgpu::Chipset chipset, function_ref<Value(Value)> packFn,
     function_ref<Value(Value)> unpackFn) {
-  
+
   Location loc = op.getLoc();
   Value dpp;
   Value res = input;
diff --git a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
index 4ebcf897f..fd8b34288 100644
--- a/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestGpuRewrite.cpp
@@ -93,9 +93,11 @@ struct TestGpuSubgroupReduceLoweringPass
       auto maybeChipset = amdgpu::Chipset::parse(target);
       if (succeeded(maybeChipset)) {
         populateGpuLowerSubgroupReduceToDPPPatterns(
-            patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *maybeChipset, PatternBenefit(2));
+            patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32,
+            *maybeChipset, PatternBenefit(2));
         populateGpuLowerClusteredSubgroupReduceToDPPPatterns(
-            patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32, *maybeChipset, PatternBenefit(2));
+            patterns, /*subgroupSize=*/64, /*shuffleBitwidth=*/32,
+            *maybeChipset, PatternBenefit(2));
       }
       populateGpuLowerSubgroupReduceToShufflePatterns(
           patterns, /*subgroupSize=*/32, /*shuffleBitwidth=*/32);

Signed-off-by: Muzammiluddin Syed <muzasyed@amd.com>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

1 participant