AMDGPU: Support local atomicrmw fmin/fmax for float/double #95590
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes: This has always been supported. Somehow, we ended up with 2 copies of clang builtins for this case, and the newer one erroneously requires gfx8-insts.

Patch is 220.40 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/95590.diff

17 Files Affected:
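As a usage sketch (an assumed example, modeled on the updated llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll test further down), this is the kind of IR that now selects directly to a DS atomic instead of being expanded into a cmpxchg loop:

; LDS (addrspace(3)) float fmax: with this patch it selects to
; ds_max_rtn_f32 (ds_max_num_rtn_f32 on GFX12) rather than a
; compare-exchange loop. Function and value names are illustrative.
define float @lds_fmax_f32(ptr addrspace(3) %ptr, float %val) {
  %res = atomicrmw fmax ptr addrspace(3) %ptr, float %val seq_cst
  ret float %res
}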
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 231db188e65dc..abdcf7b28aea6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -271,11 +271,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>;
// FIXME: Check MMO is atomic
def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>;
def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, SIatomic_fmin>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_FMAX, SIatomic_fmax>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, atomic_load_fmin_glue>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_FMAX, atomic_load_fmax_glue>;
-
+def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>;
+def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SWAP, SIbuffer_atomic_swap>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_ADD, SIbuffer_atomic_add>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b50c0cc126261..d60c62a30e7ed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -502,9 +502,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
// isa<MemSDNode> almost works but is slightly too permissive for some DS
// intrinsics.
- if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
- Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
- Opc == AMDGPUISD::ATOMIC_LOAD_FMAX) {
+ if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
N = glueCopyToM0LDSInit(N);
SelectCode(N);
return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 18193d8807597..3dd2d7fb35da4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5524,8 +5524,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
NODE_NAME_CASE(DS_ORDERED_COUNT)
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
- NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
- NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
NODE_NAME_CASE(BUFFER_LOAD)
NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
NODE_NAME_CASE(BUFFER_LOAD_USHORT)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 71c4334029b43..f1aa7e44688b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -575,8 +575,6 @@ enum NodeType : unsigned {
TBUFFER_LOAD_FORMAT_D16,
DS_ORDERED_COUNT,
ATOMIC_CMP_SWAP,
- ATOMIC_LOAD_FMIN,
- ATOMIC_LOAD_FMAX,
BUFFER_LOAD,
BUFFER_LOAD_UBYTE,
BUFFER_LOAD_USHORT,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index ae3f2b87f353d..03e2d622dd319 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3620,8 +3620,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
case TargetOpcode::G_ATOMICRMW_FADD:
- case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
- case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
+ case TargetOpcode::G_ATOMICRMW_FMIN:
+ case TargetOpcode::G_ATOMICRMW_FMAX:
return selectG_LOAD_STORE_ATOMICRMW(I);
case TargetOpcode::G_SELECT:
return selectG_SELECT(I);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 783bc9d7ef593..7e9f79a89ff7d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -682,6 +682,8 @@ defm atomic_load_umax : binary_atomic_op_all_as<atomic_load_umax>;
defm atomic_load_umin : binary_atomic_op_all_as<atomic_load_umin>;
defm atomic_load_xor : binary_atomic_op_all_as<atomic_load_xor>;
defm atomic_load_fadd : binary_atomic_op_fp_all_as<atomic_load_fadd>;
+defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>;
+defm atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>;
defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>;
defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>;
defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index ee7fb20c23aa7..207ecd6ffdcf6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -283,7 +283,9 @@ static const LLT S1 = LLT::scalar(1);
static const LLT S8 = LLT::scalar(8);
static const LLT S16 = LLT::scalar(16);
static const LLT S32 = LLT::scalar(32);
+static const LLT F32 = LLT::float32();
static const LLT S64 = LLT::scalar(64);
+static const LLT F64 = LLT::float64();
static const LLT S96 = LLT::scalar(96);
static const LLT S128 = LLT::scalar(128);
static const LLT S160 = LLT::scalar(160);
@@ -1645,6 +1647,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasFlatAtomicFaddF32Inst())
Atomic.legalFor({{S32, FlatPtr}});
+ getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
+ .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
+
if (ST.hasGFX90AInsts()) {
// These are legal with some caveats, and should have undergone expansion in
// the IR in most situations
@@ -5391,9 +5396,9 @@ static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
case Intrinsic::amdgcn_ds_fadd:
return AMDGPU::G_ATOMICRMW_FADD;
case Intrinsic::amdgcn_ds_fmin:
- return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
+ return AMDGPU::G_ATOMICRMW_FMIN;
case Intrinsic::amdgcn_ds_fmax:
- return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
+ return AMDGPU::G_ATOMICRMW_FMAX;
default:
llvm_unreachable("not a DS FP intrinsic");
}
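A hedged companion sketch (not part of the patch) of the double case the new legalFor rule covers; with {F64, LocalPtr} marked legal, GlobalISel selects this directly as well:

; LDS double fmin, covered by the new {F64, LocalPtr} legality rule.
; Names are illustrative.
define double @lds_fmin_f64(ptr addrspace(3) %ptr, double %val) {
  %res = atomicrmw fmin ptr addrspace(3) %ptr, double %val seq_cst
  ret double %res
}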
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 7ebd674757fbc..a072786eeaba2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5221,11 +5221,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ATOMICRMW_UMAX:
case AMDGPU::G_ATOMICRMW_UMIN:
case AMDGPU::G_ATOMICRMW_FADD:
+ case AMDGPU::G_ATOMICRMW_FMIN:
+ case AMDGPU::G_ATOMICRMW_FMAX:
case AMDGPU::G_ATOMICRMW_UINC_WRAP:
case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
- case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
- case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
- case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
+ case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 81098201e9c0f..74c2ecbbc77ea 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -945,6 +945,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::ATOMIC_LOAD_UMIN,
ISD::ATOMIC_LOAD_UMAX,
ISD::ATOMIC_LOAD_FADD,
+ ISD::ATOMIC_LOAD_FMIN,
+ ISD::ATOMIC_LOAD_FMAX,
ISD::ATOMIC_LOAD_UINC_WRAP,
ISD::ATOMIC_LOAD_UDEC_WRAP,
ISD::INTRINSIC_VOID,
@@ -8706,25 +8708,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
MemSDNode *M = cast<MemSDNode>(Op);
- unsigned Opc;
- switch (IntrID) {
- case Intrinsic::amdgcn_ds_fmin:
- Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
- break;
- case Intrinsic::amdgcn_ds_fmax:
- Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
- break;
- default:
- llvm_unreachable("Unknown intrinsic!");
- }
- SDValue Ops[] = {
- M->getOperand(0), // Chain
- M->getOperand(2), // Ptr
- M->getOperand(3) // Value
- };
-
- return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
- M->getMemoryVT(), M->getMemOperand());
+ unsigned Opc = IntrID == Intrinsic::amdgcn_ds_fmin ? ISD::ATOMIC_LOAD_FMIN
+ : ISD::ATOMIC_LOAD_FMAX;
+ return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), M->getOperand(0),
+ M->getOperand(2), M->getOperand(3),
+ M->getMemOperand());
}
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
@@ -9135,22 +9123,21 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmin_num: {
- Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
+ Opcode = ISD::ATOMIC_LOAD_FMIN;
break;
}
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmax_num: {
- Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
+ Opcode = ISD::ATOMIC_LOAD_FMAX;
break;
}
default:
llvm_unreachable("unhandled atomic opcode");
}
- return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op),
- M->getVTList(), Ops, M->getMemoryVT(),
- M->getMemOperand());
+ return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
+ Ops, M->getMemOperand());
}
case Intrinsic::amdgcn_s_get_barrier_state: {
SDValue Chain = Op->getOperand(0);
@@ -15821,8 +15808,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
case ISD::INTRINSIC_W_CHAIN:
return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
case AMDGPUISD::ATOMIC_CMP_SWAP:
- case AMDGPUISD::ATOMIC_LOAD_FMIN:
- case AMDGPUISD::ATOMIC_LOAD_FMAX:
case AMDGPUISD::BUFFER_ATOMIC_SWAP:
case AMDGPUISD::BUFFER_ATOMIC_ADD:
case AMDGPUISD::BUFFER_ATOMIC_SUB:
@@ -16042,7 +16027,15 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
return AtomicExpansionKind::CmpXChg;
}
case AtomicRMWInst::FMin:
- case AtomicRMWInst::FMax:
+ case AtomicRMWInst::FMax: {
+ Type *Ty = RMW->getType();
+
+ // LDS float and double fmin/fmax were always supported.
+ if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
+ return AtomicExpansionKind::None;
+
+ return AtomicExpansionKind::CmpXChg;
+ }
case AtomicRMWInst::Min:
case AtomicRMWInst::Max:
case AtomicRMWInst::UMin:
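To illustrate the new shouldExpandAtomicRMWInIR decision (a sketch under this patch's rules, with assumed function names): local float/double fmin/fmax return AtomicExpansionKind::None and stay native, while any other address space still returns CmpXChg and is rewritten into a compare-exchange loop by AtomicExpandPass:

; Stays a native DS atomic (AtomicExpansionKind::None):
;   atomicrmw fmin ptr addrspace(3) %p, double %v seq_cst
; Still expanded to a cmpxchg loop (AtomicExpansionKind::CmpXChg):
define float @global_fmax_expands(ptr addrspace(1) %ptr, float %val) {
  %res = atomicrmw fmax ptr addrspace(1) %ptr, float %val seq_cst
  ret float %res
}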
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 6682763210411..68780dae085a8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -72,14 +72,6 @@ def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
]>;
-def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32,
- [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
->;
-
-def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32,
- [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
->;
-
// load_d16_{lo|hi} ptr, tied_input
def SIload_d16 : SDTypeProfile<1, 2, [
SDTCisPtrTy<1>,
@@ -314,13 +306,6 @@ class isIntType<ValueType SrcVT> {
bit ret = !and(SrcVT.isInteger, !ne(SrcVT.Value, i1.Value));
}
-//===----------------------------------------------------------------------===//
-// PatFrags for global memory operations
-//===----------------------------------------------------------------------===//
-
-defm atomic_load_fmin : binary_atomic_op_fp_all_as<SIatomic_fmin>;
-defm atomic_load_fmax : binary_atomic_op_fp_all_as<SIatomic_fmax>;
-
//===----------------------------------------------------------------------===//
// SDNodes PatFrags for loads/stores with a glue input.
// This is for SDNodes and PatFrag for local loads and stores to
@@ -743,8 +728,8 @@ defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
defm atomic_swap : SIAtomicM0Glue2 <"SWAP">;
defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32, 0>;
-defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32, 0>;
-defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32, 0>;
+defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 0, SDTAtomic2_f32, 0>;
+defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 0, SDTAtomic2_f32, 0>;
def as_i1timm : SDNodeXForm<timm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index ba31027da92e8..71c47eaff79ec 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3863,11 +3863,6 @@ def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
let mayStore = 1;
}
-let Namespace = "AMDGPU" in {
-def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP;
-def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP;
-}
-
class BufferAtomicGenericInstruction : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index ef635fd081e11..e6a439eb58ce4 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -590,9 +590,7 @@ bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) {
}
bool isGenericAtomic(unsigned Opc) {
- return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN ||
- Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX ||
- Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP ||
+ return Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP ||
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD ||
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB ||
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN ||
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
index 6581c5c3ca9c7..f2ba7f8b21932 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
@@ -98,12 +98,13 @@ body: |
%2:_(s32) = IMPLICIT_DEF
%3:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7
%4:_(s32) = G_CONSTANT i32 0
+ %ptr_lds:_(p3) = G_IMPLICIT_DEF
- ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_AMDGPU_ATOMIC_FMIN
- %5:_(s32) = G_AMDGPU_ATOMIC_FMIN %0, %3
+ ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_FMIN
+ %5:_(s32) = G_ATOMICRMW_FMIN %ptr_lds, %4
- ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_AMDGPU_ATOMIC_FMAX
- %6:_(s32) = G_AMDGPU_ATOMIC_FMAX %0, %3
+ ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_FMAX
+ %6:_(s32) = G_ATOMICRMW_FMAX %ptr_lds, %4
; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_AMDGPU_BUFFER_ATOMIC_SWAP
%7:_(s32) = G_AMDGPU_BUFFER_ATOMIC_SWAP %0, %3, %4, %4, %4, 0, 0, 0 :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
index 6c227c707fbf1..9a6ac12702a5d 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
@@ -21,212 +21,82 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr) nounwind {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: ds_load_b32 v1, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v1, v2, v2
-; GFX12-NEXT: v_max_num_f32_e32 v1, 4.0, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, 4.0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
+; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB0_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f32:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: ds_read_b32 v1, v0
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, v1
-; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX940-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX940-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX940-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX940-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB0_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmax_ret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: ds_load_b32 v1, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX11-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, 4.0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2
+; GFX11-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB0_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_ret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: ds_read_b32 v1, v0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX10-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, 4.0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX10-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter. |
LGTM
This has always been supported. Somehow, we ended up with 2 copies of clang builtins for this case, and the newer one erroneously requires gfx8-insts.