-
Notifications
You must be signed in to change notification settings - Fork 13.9k
[AMDGPU][MLIR] Replace gfx940 and gfx941 with gfx942 in MLIR #125836
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-amdgpu Author: Fabian Ritter (ritter-x2a) Changesgfx940 and gfx941 are no longer supported. This is one of a series of For SWDEV-512631 Full diff: https://github.com/llvm/llvm-project/pull/125836.diff 10 Files Affected:
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 69745addfd748ec..24f541587cba88a 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -602,7 +602,7 @@ def AMDGPU_MFMAOp :
order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on).
The negateA, negateB, and negateC flags are only supported for double-precision
- operations on gfx940+.
+ operations on gfx942+.
}];
let assemblyFormat = [{
$sourceA `*` $sourceB `+` $destC
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 974712c581537a9..8b4f7e49f573d68 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -382,11 +382,11 @@ def ROCDL_mfma_f32_16x16x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x4bf16.1k">
def ROCDL_mfma_f32_4x4x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.4x4x4bf16.1k">;
def ROCDL_mfma_f32_32x32x8bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x8bf16.1k">;
def ROCDL_mfma_f32_16x16x16bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x16bf16.1k">;
-// Note: in gfx940, unlike in gfx90a, the f64 xdlops use the "blgp" argument as a
-// NEG bitfield. See IntrinsicsAMDGPU.td for more info.
+// Note: in gfx942, unlike in gfx90a, the f64 xdlops use the "blgp" argument as
+// a NEG bitfield. See IntrinsicsAMDGPU.td for more info.
def ROCDL_mfma_f64_16x16x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.16x16x4f64">;
def ROCDL_mfma_f64_4x4x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.4x4x4f64">;
-// New in gfx940.
+// New in gfx942.
def ROCDL_mfma_i32_16x16x32_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.16x16x32.i8">;
def ROCDL_mfma_i32_32x32x16_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.32x32x16.i8">;
def ROCDL_mfma_f32_16x16x8_xf32 : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x8.xf32">;
@@ -409,7 +409,7 @@ def ROCDL_mfma_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.f16">;
def ROCDL_mfma_scale_f32_16x16x128_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.16x16x128.f8f6f4", [0,1]>;
def ROCDL_mfma_scale_f32_32x32x64_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.32x32x64.f8f6f4", [0,1]>;
-// 2:4 Sparsity ops (GFX940)
+// 2:4 Sparsity ops (GFX942)
def ROCDL_smfmac_f32_16x16x32_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.f16">;
def ROCDL_smfmac_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x16.f16">;
def ROCDL_smfmac_f32_16x16x32_bf16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.bf16">;
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 51f5d7a161b9030..f67d174d7d7c9a6 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -80,7 +80,7 @@ namespace {
// Define commonly used chipsets versions for convenience.
constexpr Chipset kGfx908 = Chipset(9, 0, 8);
constexpr Chipset kGfx90a = Chipset(9, 0, 0xa);
-constexpr Chipset kGfx940 = Chipset(9, 4, 0);
+constexpr Chipset kGfx942 = Chipset(9, 4, 2);
/// Define lowering patterns for raw buffer ops
template <typename GpuOp, typename Intrinsic>
@@ -483,7 +483,7 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
destElem = destType.getElementType();
if (sourceElem.isF32() && destElem.isF32()) {
- if (mfma.getReducePrecision() && chipset >= kGfx940) {
+ if (mfma.getReducePrecision() && chipset >= kGfx942) {
if (m == 32 && n == 32 && k == 4 && b == 1)
return ROCDL::mfma_f32_32x32x4_xf32::getOperationName();
if (m == 16 && n == 16 && k == 8 && b == 1)
@@ -551,9 +551,9 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
return ROCDL::mfma_i32_32x32x8i8::getOperationName();
if (m == 16 && n == 16 && k == 16 && b == 1)
return ROCDL::mfma_i32_16x16x16i8::getOperationName();
- if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx940)
+ if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx942)
return ROCDL::mfma_i32_32x32x16_i8::getOperationName();
- if (m == 16 && n == 16 && k == 32 && b == 1 && chipset >= kGfx940)
+ if (m == 16 && n == 16 && k == 32 && b == 1 && chipset >= kGfx942)
return ROCDL::mfma_i32_16x16x32_i8::getOperationName();
}
@@ -565,7 +565,7 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
}
if (isa<Float8E5M2FNUZType>(sourceElem) && destElem.isF32() &&
- chipset >= kGfx940) {
+ chipset >= kGfx942) {
// Known to be correct because there are no scalar f8 instructions and
// because a length mismatch will have been caught by the verifier.
Type sourceBElem =
@@ -585,7 +585,7 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
}
if (isa<Float8E4M3FNUZType>(sourceElem) && destElem.isF32() &&
- chipset >= kGfx940) {
+ chipset >= kGfx942) {
Type sourceBElem =
cast<VectorType>(mfma.getSourceB().getType()).getElementType();
if (m == 16 && n == 16 && k == 32 && b == 1) {
@@ -653,8 +653,8 @@ struct MFMAOpLowering : public ConvertOpToLLVMPattern<MFMAOp> {
return op->emitOpError("MFMA only supported on gfx908+");
uint32_t getBlgpField = static_cast<uint32_t>(op.getBlgp());
if (op.getNegateA() || op.getNegateB() || op.getNegateC()) {
- if (chipset < kGfx940)
- return op.emitOpError("negation unsupported on older than gfx940");
+ if (chipset < kGfx942)
+ return op.emitOpError("negation unsupported on older than gfx942");
getBlgpField |=
op.getNegateA() | (op.getNegateB() << 1) | (op.getNegateC() << 2);
}
@@ -775,7 +775,7 @@ LogicalResult ExtPackedFp8OpLowering::matchAndRewrite(
ExtPackedFp8Op op, ExtPackedFp8OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
Location loc = op.getLoc();
- if (chipset.majorVersion != 9 || chipset < kGfx940)
+ if (chipset.majorVersion != 9 || chipset < kGfx942)
return rewriter.notifyMatchFailure(
loc, "Fp8 conversion instructions are not available on target "
"architecture and their emulation is not implemented");
@@ -819,7 +819,7 @@ LogicalResult PackedTrunc2xFp8OpLowering::matchAndRewrite(
PackedTrunc2xFp8Op op, PackedTrunc2xFp8OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
Location loc = op.getLoc();
- if (chipset.majorVersion != 9 || chipset < kGfx940)
+ if (chipset.majorVersion != 9 || chipset < kGfx942)
return rewriter.notifyMatchFailure(
loc, "Fp8 conversion instructions are not available on target "
"architecture and their emulation is not implemented");
@@ -856,7 +856,7 @@ LogicalResult PackedStochRoundFp8OpLowering::matchAndRewrite(
PackedStochRoundFp8Op op, PackedStochRoundFp8OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
Location loc = op.getLoc();
- if (chipset.majorVersion != 9 || chipset < kGfx940)
+ if (chipset.majorVersion != 9 || chipset < kGfx942)
return rewriter.notifyMatchFailure(
loc, "Fp8 conversion instructions are not available on target "
"architecture and their emulation is not implemented");
diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
index 33370566996eee5..338a2124149a844 100644
--- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
+++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
@@ -381,7 +381,7 @@ void ArithToAMDGPUConversionPass::runOnOperation() {
}
bool convertFP8Arithmetic =
- maybeChipset->majorVersion == 9 && *maybeChipset >= Chipset(9, 4, 0);
+ maybeChipset->majorVersion == 9 && *maybeChipset >= Chipset(9, 4, 2);
arith::populateArithToAMDGPUConversionPatterns(
patterns, convertFP8Arithmetic, saturateFP8Truncf, allowPackedF16Rtz,
*maybeChipset);
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
index 77f972e0e589445..7459a6503cddfb4 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
@@ -179,7 +179,7 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns(
}
// gfx9 has no to a very limited support for floating-point min and max.
if (chipset.majorVersion == 9) {
- if (chipset >= Chipset(9, 0, 0xa) && chipset != Chipset(9, 4, 1)) {
+ if (chipset >= Chipset(9, 0, 0xa)) {
// gfx90a supports f64 max (and min, but we don't have a min wrapper right
// now) but all other types need to be emulated.
target.addDynamicallyLegalOp<RawBufferAtomicFmaxOp>(
@@ -189,12 +189,6 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns(
} else {
target.addIllegalOp<RawBufferAtomicFmaxOp>();
}
- if (chipset == Chipset(9, 4, 1)) {
- // gfx941 requires non-CAS atomics to be implemented with CAS loops.
- // The workaround here mirrors HIP and OpenMP.
- target.addIllegalOp<RawBufferAtomicFaddOp, RawBufferAtomicFmaxOp,
- RawBufferAtomicSmaxOp, RawBufferAtomicUminOp>();
- }
}
patterns.add<
RawBufferAtomicByCasPattern<RawBufferAtomicFaddOp, arith::AddFOp>,
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir b/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir
index 7818a525d17b537..a313aaffdf5ccca 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx940 | FileCheck %s
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s
// CHECK-LABEL: func @ext_scalar
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %{{.+}} : f8E5M2FNUZ to i8
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir b/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir
index f8a60d37801ebec..52db1421dc3c686 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx940 -cse | FileCheck %s
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 -cse | FileCheck %s
func.func @mfma_to_rocdl(%arg0 : f32, %arg1 : vector<32xf32>,
%arg2 : vector<16xf32>, %arg3 : vector<4xf32>,
%arg4 : vector<4xf16>, %arg5 : vector<4xi8>,
diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir b/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir
index cd921da2294e13b..07a428566d48852 100644
--- a/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir
+++ b/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt --split-input-file %s \
-// RUN: --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{chipset=gfx940 saturate-fp8-truncf=true}))' \
+// RUN: --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{chipset=gfx942 saturate-fp8-truncf=true}))' \
// RUN: | FileCheck %s
// CHECK-LABEL: func.func @scalar_trunc
diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir b/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir
index bd90facb6154408..da834ff4a111526 100644
--- a/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir
+++ b/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx940" | FileCheck %s
+// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx942" | FileCheck %s
// CHECK-LABEL: func.func @scalar_ext
// CHECK-SAME: ([[V:%.+]]: f8E5M2FNUZ)
diff --git a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
index 976ff2e7382edf7..570d56f3c6ff139 100644
--- a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
+++ b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
@@ -19,11 +19,11 @@ TEST(ChipsetTest, Parsing) {
EXPECT_EQ(chipset->minorVersion, 0u);
EXPECT_EQ(chipset->steppingVersion, 0xau);
- chipset = Chipset::parse("gfx940");
+ chipset = Chipset::parse("gfx942");
ASSERT_TRUE(succeeded(chipset));
EXPECT_EQ(chipset->majorVersion, 9u);
EXPECT_EQ(chipset->minorVersion, 4u);
- EXPECT_EQ(chipset->steppingVersion, 0u);
+ EXPECT_EQ(chipset->steppingVersion, 2u);
chipset = Chipset::parse("gfx1103");
ASSERT_TRUE(succeeded(chipset));
@@ -36,30 +36,26 @@ TEST(ChipsetTest, ParsingInvalid) {
EXPECT_TRUE(failed(Chipset::parse("navi33")));
EXPECT_TRUE(failed(Chipset::parse("rdna2")));
EXPECT_TRUE(failed(Chipset::parse("sm_80")));
- EXPECT_TRUE(failed(Chipset::parse("GFX940")));
- EXPECT_TRUE(failed(Chipset::parse("Gfx940")));
+ EXPECT_TRUE(failed(Chipset::parse("GFX942")));
+ EXPECT_TRUE(failed(Chipset::parse("Gfx942")));
EXPECT_TRUE(failed(Chipset::parse("gfx9")));
- EXPECT_TRUE(failed(Chipset::parse("gfx_940")));
- EXPECT_TRUE(failed(Chipset::parse("gfx940_")));
+ EXPECT_TRUE(failed(Chipset::parse("gfx_942")));
+ EXPECT_TRUE(failed(Chipset::parse("gfx942_")));
EXPECT_TRUE(failed(Chipset::parse("gfxmeow")));
EXPECT_TRUE(failed(Chipset::parse("gfx1fff")));
}
TEST(ChipsetTest, Comparison) {
- EXPECT_EQ(Chipset(9, 4, 0), Chipset(9, 4, 0));
- EXPECT_NE(Chipset(9, 4, 0), Chipset(9, 4, 2));
+ EXPECT_EQ(Chipset(9, 4, 2), Chipset(9, 4, 2));
EXPECT_NE(Chipset(9, 0, 0), Chipset(10, 0, 0));
EXPECT_LT(Chipset(9, 0, 0), Chipset(10, 0, 0));
EXPECT_LT(Chipset(9, 0, 0), Chipset(9, 4, 2));
- EXPECT_LE(Chipset(9, 4, 1), Chipset(9, 4, 1));
EXPECT_FALSE(Chipset(9, 4, 2) < Chipset(9, 4, 2));
- EXPECT_FALSE(Chipset(9, 4, 2) < Chipset(9, 4, 0));
EXPECT_GT(Chipset(9, 0, 0xa), Chipset(9, 0, 8));
EXPECT_GE(Chipset(9, 0, 0xa), Chipset(9, 0, 0xa));
- EXPECT_FALSE(Chipset(9, 4, 1) >= Chipset(9, 4, 2));
- EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 0));
+ EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 2));
}
} // namespace
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These seem too aggressive. we're not deleting the 940 target, and all the same properties apply
As far as I'm aware deleting the 940 target is the goal (the PRs for that don't exist yet, but should end up in this stack) |
Ok so I'm going to take issue with jumping the constants in a bunch of tests to gfx8942. Unless we're planning to completely wipe gfx940/1 from the codebase - which seems extremely unusual and like bad practice ... gfx940 and 941 are real targets that did really exist and might need code compiled for them in the future - we should still allow compiling for there targets while updating tests and such to gfx942. And as to the atomics emulation pass ... what's the harm in keeping some old workaround around? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok yeah, just to be safe, blocker on actually making things that work fine unsupported do this doesn't land overnight, ping me tomorrow
I'll bring this issue up for discussion in the next compiler team meeting. |
I'll try and remember to go to that |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Per further discussion, given the broader context of these changes, this is fine assuming @arsenm's comments about the documentation are addressed.
656adb4
to
0c95e13
Compare
97f4e5a
to
175aff5
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since this essentially breaks logic for gfx940 and gfx941, should we assert in code like Chipset
that these are not used and silently miscompiled?
@kuhar as far as I can see that would be the first check for invalid targets in this part of the code base. I don't think we should treat gfx940/gfx941 differently than any other invalid target. We will only merge this PR together with the rest of the stack, which will remove gfx940\gfx941 code generation from the rest of LLVM as well. |
0c95e13
to
8dd1c0a
Compare
175aff5
to
55e8283
Compare
8dd1c0a
to
c27c8b4
Compare
51b377d
to
876dba7
Compare
c27c8b4
to
61eb3cc
Compare
Merge activity
|
876dba7
to
21cf1a0
Compare
61eb3cc
to
a86e6a8
Compare
gfx940 and gfx941 are no longer supported. This is one of a series of PRs to remove them from the code base. For SWDEV-512631
a86e6a8
to
d623fc8
Compare
gfx940 and gfx941 are no longer supported. This is one of a series of
PRs to remove them from the code base.
For SWDEV-512631