Skip to content

[mlir][gpu] Pattern to promote gpu.shuffle to specialized AMDGPU ops #137109

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion mlir/include/mlir/Conversion/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,7 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> {
let constructor = "mlir::createLowerGpuOpsToROCDLOpsPass()";
let dependentDialects = [
"ROCDL::ROCDLDialect",
"amdgpu::AMDGPUDialect",
"cf::ControlFlowDialect",
"memref::MemRefDialect",
];
Expand Down Expand Up @@ -1415,7 +1416,7 @@ def ConvertVectorToLLVMPass : Pass<"convert-vector-to-llvm"> {
"bool", /*default=*/"false",
"Use the preferred alignment of a vector type in load/store "
"operations instead of the alignment of the element type of the "
"memref. This flag is intended for use with hardware which requires"
"memref. This flag is intended for use with hardware which requires"
"vector alignment, or in application contexts where it is known all "
"vector access are naturally aligned. ">,
Option<"amx", "enable-amx",
Expand Down
27 changes: 19 additions & 8 deletions mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -132,24 +132,24 @@ def MapNestedForallToThreads :
TransformEachOpTrait,
TransformOpInterface]> {
let description = [{
Target the `gpu.launch op` and rewrite all `scf.forall` nested in it to
Target the `gpu.launch op` and rewrite all `scf.forall` nested in it to
distributed `gpu.thread_id` attribute.

The operation searches for `scf.forall` ops nested under `target` and maps
each such op to GPU threads.
each such op to GPU threads.

`scf.forall` induction variables are rewritten to `gpu.thread_id` according
to the `mapping` attribute.

Different types of mappings attributes are supported:
- the block_dims is a list of integers that specifies the number of
threads in each dimension. This is a mandatory attribute that is used
to constrain the number of threads in each dimension. If an
to constrain the number of threads in each dimension. If an
`scf.forall` op is mapped to fewer threads, predication occurs.
- the warp_dims is a list of integers that specifies the number of
warps in each dimension. This is an optional attribute that is used
to constrain the number of warps in each dimension. When present, this
attribute must be specified in a way that is compatible with the
attribute must be specified in a way that is compatible with the
block_dims attribute. If an `scf.forall` op is mapped to fewer warps,
predication occurs.

Expand All @@ -164,7 +164,7 @@ def MapNestedForallToThreads :
inserted after each scf.forall op. At this time, this is an all or nothing
choice. This will need to be tightened in the future.

The operation alters the block size of the given gpu_launch using the
The operation alters the block size of the given gpu_launch using the
mandatory block_dims argument.

#### Return modes:
Expand Down Expand Up @@ -268,7 +268,7 @@ def MapForallToBlocks :
Only scf.forall distributed to **at most 3 dimensions** are
currently supported.

The operation alters the block size of the given gpu_launch using the
The operation alters the block size of the given gpu_launch using the
grid_dims argument.

#### Return modes:
Expand Down Expand Up @@ -300,7 +300,7 @@ def MapForallToBlocks :
`:` functional-type($target, $result)
}];
let hasVerifier = 1;

let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(
::mlir::transform::TransformRewriter &rewriter,
Expand All @@ -310,4 +310,15 @@ def MapForallToBlocks :
}];
}

// Transform-dialect pattern descriptor op. It carries no operands or results
// (only an attribute dictionary) and is used inside `transform.apply_patterns`
// to select the `gpu.shuffle` -> AMDGPU promotion patterns.
def ApplyGPUPromoteShuffleToAMDGPUPatternsOp : Op<Transform_Dialect,
    "apply_patterns.gpu.gpu_shuffle_to_amdgpu",
    [DeclareOpInterfaceMethods<PatternDescriptorOpInterface>]> {
  let description = [{
    Collects patterns that try to promote `gpu.shuffle`s to specialized
    AMDGPU intrinsics.
  }];
  let assemblyFormat = "attr-dict";
}


#endif // GPU_TRANSFORM_OPS
3 changes: 3 additions & 0 deletions mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ void populateGpuDecomposeMemrefsPatterns(RewritePatternSet &patterns);
/// Erase barriers that do not enforce conflicting memory side effects.
void populateGpuEliminateBarriersPatterns(RewritePatternSet &patterns);

/// Adds to `patterns` the rewrite patterns that try to promote `gpu.shuffle`
/// ops to specialized AMDGPU ops. Shuffles that do not satisfy a pattern's
/// preconditions are reported as match failures and are not rewritten.
void populateGpuPromoteShuffleToAMDGPUPatterns(RewritePatternSet &patterns);

/// Generate the code for registering passes.
#define GEN_PASS_REGISTRATION
#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
Expand Down
5 changes: 3 additions & 2 deletions mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
#include "mlir/Conversion/MathToROCDL/MathToROCDL.h"
#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
Expand Down Expand Up @@ -197,8 +198,7 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
Value widthOrZeroIfOutside =
rewriter.create<LLVM::AndOp>(loc, int32Type, add, negwidth);
Value dstLane;
// TODO: Use ds_swizzle for XOR when step/offsets are constants for better
// perf.

switch (op.getMode()) {
case gpu::ShuffleMode::UP:
dstLane = rewriter.create<LLVM::SubOp>(loc, int32Type, srcLaneId,
Expand Down Expand Up @@ -319,6 +319,7 @@ struct LowerGpuOpsToROCDLOpsPass final
{
RewritePatternSet patterns(ctx);
populateGpuRewritePatterns(patterns);
populateGpuPromoteShuffleToAMDGPUPatterns(patterns);
(void)applyPatternsGreedily(m, std::move(patterns));
}

Expand Down
11 changes: 6 additions & 5 deletions mlir/lib/Dialect/GPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,10 @@ add_mlir_dialect_library(MLIRGPUTransforms
Transforms/ModuleToBinary.cpp
Transforms/NVVMAttachTarget.cpp
Transforms/ParallelLoopMapper.cpp
Transforms/PromoteShuffleToAMDGPU.cpp
Transforms/ROCDLAttachTarget.cpp
Transforms/ShuffleRewriter.cpp
Transforms/SPIRVAttachTarget.cpp
Transforms/ShuffleRewriter.cpp
Transforms/SubgroupIdRewriter.cpp
Transforms/SubgroupReduceLowering.cpp

Expand All @@ -53,8 +54,8 @@ add_mlir_dialect_library(MLIRGPUTransforms
MLIRParallelLoopMapperEnumsGen

LINK_LIBS PUBLIC
MLIRAffineUtils
MLIRAMDGPUDialect
MLIRAffineUtils
MLIRArithDialect
MLIRAsyncDialect
MLIRBufferizationDialect
Expand All @@ -68,12 +69,12 @@ add_mlir_dialect_library(MLIRGPUTransforms
MLIRMemRefDialect
MLIRNVVMTarget
MLIRPass
MLIRROCDLDialect
MLIRROCDLTarget
MLIRSCFDialect
MLIRSideEffectInterfaces
MLIRSPIRVTarget
MLIRSideEffectInterfaces
MLIRSupport
MLIRROCDLDialect
MLIRROCDLTarget
MLIRTransformUtils
MLIRVectorDialect
)
Expand Down
11 changes: 9 additions & 2 deletions mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
Expand Down Expand Up @@ -136,6 +137,11 @@ void ApplyGPURewritePatternsOp::populatePatterns(RewritePatternSet &patterns) {
populateGpuRewritePatterns(patterns);
}

/// Pattern-descriptor hook: forwards to the GPU transforms library so that the
/// `gpu.shuffle` -> AMDGPU promotion patterns are appended to `patterns`.
void transform::ApplyGPUPromoteShuffleToAMDGPUPatternsOp::populatePatterns(
    RewritePatternSet &patterns) {
  ::mlir::populateGpuPromoteShuffleToAMDGPUPatterns(patterns);
}

//===----------------------------------------------------------------------===//
// ApplyUnrollVectorsSubgroupMmaOp
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -914,9 +920,10 @@ class GPUTransformDialectExtension
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(GPUTransformDialectExtension)

GPUTransformDialectExtension() {
declareGeneratedDialect<scf::SCFDialect>();
declareGeneratedDialect<arith::ArithDialect>();
declareGeneratedDialect<GPUDialect>();
declareGeneratedDialect<amdgpu::AMDGPUDialect>();
declareGeneratedDialect<arith::ArithDialect>();
declareGeneratedDialect<scf::SCFDialect>();
registerTransformOps<
#define GET_OP_LIST
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
Expand Down
64 changes: 64 additions & 0 deletions mlir/lib/Dialect/GPU/Transforms/PromoteShuffleToAMDGPU.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
//===- PromoteShuffleToAMDGPU.cpp - Promote shuffle to AMDGPU -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains patterns to try to promote `gpu.shuffle`s to specialized
// AMDGPU intrinsics.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/Transforms/Passes.h"

#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/PatternMatch.h"

using namespace mlir;

namespace {
/// Rewrites an XOR-mode `gpu.shuffle` into `amdgpu.swizzle_bitmode`.
///
/// The rewrite only fires when the shuffle width is the constant 64 and the
/// offset is a constant integer within [0, 31]. Any other shuffle is reported
/// as a match failure and left unchanged. The constant offset becomes the xor
/// mask of the swizzle (and-mask 31, or-mask 0), and the shuffle's second
/// result is replaced by a 1-bit constant with value 1.
struct PromoteShuffleToSwizzlePattern
    : public OpRewritePattern<gpu::ShuffleOp> {
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(gpu::ShuffleOp op,
                                PatternRewriter &rewriter) const override {
    // Precondition 1: the shuffle must be in XOR mode.
    const bool isXorMode = op.getMode() == gpu::ShuffleMode::XOR;
    if (!isXorMode)
      return rewriter.notifyMatchFailure(op,
                                         "only xor shuffle mode is supported");

    // Precondition 2: the width operand must be the constant 64.
    const bool hasWidth64 = isConstantIntValue(op.getWidth(), 64);
    if (!hasWidth64)
      return rewriter.notifyMatchFailure(op,
                                         "only 64 width shuffle is supported");

    // Precondition 3: the offset operand must be a known integer constant.
    std::optional<int64_t> maybeXorMask = getConstantIntValue(op.getOffset());
    if (!maybeXorMask.has_value())
      return rewriter.notifyMatchFailure(op,
                                         "offset must be a constant integer");

    // Precondition 4: the constant offset must lie within [0, 31].
    int64_t xorMask = *maybeXorMask;
    const bool offsetInRange = 0 <= xorMask && xorMask <= 31;
    if (!offsetInRange)
      return rewriter.notifyMatchFailure(op,
                                         "offset must be in the range [0, 31]");

    // Build the replacement values: the swizzled value first, then the
    // constant that stands in for the shuffle's second result.
    Location loc = op.getLoc();
    Type swizzledType = op.getResult(0).getType();
    Value swizzled = rewriter.create<amdgpu::SwizzleBitModeOp>(
        loc, swizzledType, op.getValue(), /*andMask=*/31, /*orMask=*/0,
        /*xorMask=*/xorMask);
    Value alwaysValid =
        rewriter.create<arith::ConstantIntOp>(loc, 1, /*width*/ 1);

    rewriter.replaceOp(op, {swizzled, alwaysValid});
    return success();
  }
};
} // namespace

/// Registers the `gpu.shuffle` promotion patterns defined in this file with
/// `patterns`, constructing them with the context owned by the pattern set.
void mlir::populateGpuPromoteShuffleToAMDGPUPatterns(
    RewritePatternSet &patterns) {
  MLIRContext *context = patterns.getContext();
  patterns.add<PromoteShuffleToSwizzlePattern>(context);
}
23 changes: 23 additions & 0 deletions mlir/test/Dialect/GPU/promote-shuffle-amdgpu.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// RUN: mlir-opt --transform-interpreter --split-input-file %s | FileCheck %s

// Transform script run by the interpreter: it matches every `func.func` in the
// payload module and applies the `gpu_shuffle_to_amdgpu` pattern set to it.
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
%func = transform.structured.match ops{["func.func"]} in %module_op : (!transform.any_op) -> !transform.any_op
transform.apply_patterns to %func {
transform.apply_patterns.gpu.gpu_shuffle_to_amdgpu
} : !transform.any_op
transform.yield
}
}

// Positive case: an XOR shuffle with constant width 64 and constant offset 23
// (inside [0, 31]). It is expected to become `amdgpu.swizzle_bitmode` with
// masks 31 / 0 / 23, and the validity result is expected to be a constant true.
// CHECK-LABEL: func @gpu_shuffle_swizzle
// CHECK-SAME: (%[[ARG:.*]]: i32)
func.func @gpu_shuffle_swizzle(%arg0: i32) -> (i32, i1) {
// CHECK: %[[TRUE:.*]] = arith.constant true
// CHECK: %[[RES:.*]] = amdgpu.swizzle_bitmode %[[ARG]] 31 0 23 : i32
// CHECK: return %[[RES]], %[[TRUE]] : i32, i1
%width = arith.constant 64 : i32
%offset = arith.constant 23 : i32
%shfl, %pred = gpu.shuffle xor %arg0, %offset, %width : i32
func.return %shfl, %pred : i32, i1
}