Skip to content

[mlir] GPUToROCDL: Add support for non-i32/f32 shuffle types #136320

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -840,7 +840,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
- a variadic number of Private memory attributions.

The `kernelFunc` and `kernelModule` attributes are optional and specify
the kernel name and a module in which the kernel should be outlined.
the kernel name and a module in which the kernel should be outlined.

Syntax:

Expand Down Expand Up @@ -1201,7 +1201,7 @@ def GPU_AllReduceOp : GPU_Op<"all_reduce",
}

def AnyIntegerOrFloatOr1DVector :
AnyTypeOf<[AnyIntegerOrFloat, VectorOfRankAndType<[1], [AnyIntegerOrFloat]>]>;
AnyTypeOf<[AnyIntegerOrFloat, FixedVectorOfRankAndType<[1], [AnyIntegerOrFloat]>]>;

def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType]> {
let summary = "Reduce values among subgroup.";
Expand Down
26 changes: 12 additions & 14 deletions mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
return canBeBare;
}

Value getLaneId(ConversionPatternRewriter &rewriter, Location loc,
const unsigned indexBitwidth) {
static Value getLaneId(ConversionPatternRewriter &rewriter, Location loc,
const unsigned indexBitwidth) {
auto int32Type = IntegerType::get(rewriter.getContext(), 32);
Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
Value minus1 = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
Expand Down Expand Up @@ -138,10 +138,6 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
Location loc = op->getLoc();
Value initShflValue = adaptor.getValue();
Type shflType = initShflValue.getType();
// TODO: Add support for non 32-bit shuffle values.
if (!shflType.isIntOrFloat() || shflType.getIntOrFloatBitWidth() != 32)
return rewriter.notifyMatchFailure(
op, "only 32-bit int/float types are supported");

const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth);
Expand Down Expand Up @@ -179,15 +175,17 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
Value two = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 2);
Value dwordAlignedDstLane =
rewriter.create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);
if (shflType.isF32()) {
initShflValue =
rewriter.create<LLVM::BitcastOp>(loc, int32Type, initShflValue);
}
Value shflValue = rewriter.create<ROCDL::DsBpermuteOp>(
loc, int32Type, dwordAlignedDstLane, initShflValue);
if (shflType.isF32()) {
shflValue = rewriter.create<LLVM::BitcastOp>(loc, shflType, shflValue);

SmallVector<Value> decomposed =
LLVM::decomposeValue(rewriter, loc, initShflValue, int32Type);
SmallVector<Value> swizzled;
for (Value v : decomposed) {
Value res = rewriter.create<ROCDL::DsBpermuteOp>(loc, int32Type,
dwordAlignedDstLane, v);
swizzled.emplace_back(res);
}
Value shflValue =
LLVM::composeValue(rewriter, loc, swizzled, initShflValue.getType());
rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
return success();
}
Expand Down
13 changes: 0 additions & 13 deletions mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-unsupported.mlir

This file was deleted.

21 changes: 21 additions & 0 deletions mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -710,6 +710,27 @@ gpu.module @test_module {
%shfld, %predd = gpu.shuffle down %arg0, %arg1, %arg2 : f32
func.return %shfl, %shfli, %shfld : f32, f32, f32
}

// Verifies that a shuffle of a 64-bit vector value is lowered by bitcasting it
// to vector<2xi32>, permuting each 32-bit element with its own
// rocdl.ds_bpermute, and reassembling the result into the original type.
// CHECK-LABEL: func @gpu_shuffle_vec
// CHECK-SAME: (%[[ARG:.*]]: vector<4xf16>, %{{.*}}: i32, %{{.*}}: i32)
func.func @gpu_shuffle_vec(%arg0: vector<4xf16>, %arg1: i32, %arg2: i32) -> vector<4xf16> {
  // Decompose: both extracts must read from the captured bitcast result, not
  // from a hard-coded SSA number that changes with unrelated lowering edits.
  // CHECK: %[[CAST1:.*]] = llvm.bitcast %[[ARG]] : vector<4xf16> to vector<2xi32>
  // CHECK: %[[IDX0:.*]] = llvm.mlir.constant(0 : i32) : i32
  // CHECK: %[[ELEM0:.*]] = llvm.extractelement %[[CAST1]][%[[IDX0]] : i32] : vector<2xi32>
  // CHECK: %[[IDX1:.*]] = llvm.mlir.constant(1 : i32) : i32
  // CHECK: %[[ELEM1:.*]] = llvm.extractelement %[[CAST1]][%[[IDX1]] : i32] : vector<2xi32>
  // One permute per 32-bit element.
  // CHECK: %[[PERM0:.*]] = rocdl.ds_bpermute %{{.*}}, %[[ELEM0]] : (i32, i32) -> i32
  // CHECK: %[[PERM1:.*]] = rocdl.ds_bpermute %{{.*}}, %[[ELEM1]] : (i32, i32) -> i32
  // Recompose the permuted elements and cast back to the source type.
  // CHECK: %[[V0:.*]] = llvm.mlir.poison : vector<2xi32>
  // CHECK: %[[IDX0:.*]] = llvm.mlir.constant(0 : i32) : i32
  // CHECK: %[[V1:.*]] = llvm.insertelement %[[PERM0]], %[[V0]][%[[IDX0]] : i32] : vector<2xi32>
  // CHECK: %[[IDX1:.*]] = llvm.mlir.constant(1 : i32) : i32
  // CHECK: %[[V2:.*]] = llvm.insertelement %[[PERM1]], %[[V1]][%[[IDX1]] : i32] : vector<2xi32>
  // CHECK: %[[RES:.*]] = llvm.bitcast %[[V2]] : vector<2xi32> to vector<4xf16>
  // CHECK: llvm.return %[[RES]] : vector<4xf16>
  %shfl, %pred = gpu.shuffle xor %arg0, %arg1, %arg2 : vector<4xf16>
  func.return %shfl : vector<4xf16>
}
}

// -----
Expand Down
14 changes: 11 additions & 3 deletions mlir/test/Dialect/GPU/invalid.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -367,15 +367,15 @@ func.func @subgroup_reduce_cluster_stride_without_size(%arg0 : vector<4xf32>) {
// -----

func.func @subgroup_reduce_bad_type(%arg0 : vector<2x2xf32>) {
// expected-error@+1 {{'gpu.subgroup_reduce' op operand #0 must be Integer or Float or vector of}}
// expected-error@+1 {{'gpu.subgroup_reduce' op operand #0 must be Integer or Float or fixed-length vector of}}
%res = gpu.subgroup_reduce add %arg0 : (vector<2x2xf32>) -> vector<2x2xf32>
return
}

// -----

func.func @subgroup_reduce_bad_type_scalable(%arg0 : vector<[2]xf32>) {
// expected-error@+1 {{is not compatible with scalable vector types}}
// expected-error@+1 {{'gpu.subgroup_reduce' op operand #0 must be Integer or Float or fixed-length vector of}}
%res = gpu.subgroup_reduce add %arg0 : (vector<[2]xf32>) -> vector<[2]xf32>
return
}
Expand Down Expand Up @@ -463,13 +463,21 @@ func.func @shuffle_mismatching_type(%arg0 : f32, %arg1 : i32, %arg2 : i32) {
// -----

func.func @shuffle_unsupported_type(%arg0 : index, %arg1 : i32, %arg2 : i32) {
// expected-error@+1 {{op operand #0 must be Integer or Float or vector of Integer or Float values of ranks 1, but got 'index'}}
// expected-error@+1 {{op operand #0 must be Integer or Float or fixed-length vector of Integer or Float values of ranks 1, but got 'index'}}
%shfl, %pred = gpu.shuffle xor %arg0, %arg1, %arg2 : index
return
}

// -----

// Negative test: gpu.shuffle on a scalable vector operand (vector<[4]xf32>)
// must be rejected with the "fixed-length vector" operand-type diagnostic.
func.func @shuffle_unsupported_type_vec(%arg0 : vector<[4]xf32>, %arg1 : i32, %arg2 : i32) {
// expected-error@+1 {{op operand #0 must be Integer or Float or fixed-length vector of Integer or Float values of ranks 1, but got 'vector<[4]xf32>'}}
%shfl, %pred = gpu.shuffle xor %arg0, %arg1, %arg2 : vector<[4]xf32>
return
}

// -----

module {
gpu.module @gpu_funcs {
// expected-error @+1 {{custom op 'gpu.func' gpu.func requires named arguments}}
Expand Down
Loading