Skip to content

Commit 4972ec2

Browse files
committed
AMDGPU: Support local atomicrmw fmin/fmax for float/double
This has always been supported. Somehow, we ended up with 2 copies of clang builtins for this case, and the newer one erroneously requires gfx8-insts.
1 parent 21ba91c commit 4972ec2

17 files changed

+479
-3039
lines changed

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -271,11 +271,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>;
271271
// FIXME: Check MMO is atomic
272272
def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>;
273273
def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>;
274-
def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, SIatomic_fmin>;
275-
def : GINodeEquiv<G_AMDGPU_ATOMIC_FMAX, SIatomic_fmax>;
276-
def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, atomic_load_fmin_glue>;
277-
def : GINodeEquiv<G_AMDGPU_ATOMIC_FMAX, atomic_load_fmax_glue>;
278-
274+
def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>;
275+
def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>;
279276

280277
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SWAP, SIbuffer_atomic_swap>;
281278
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_ADD, SIbuffer_atomic_add>;

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -502,9 +502,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
502502

503503
// isa<MemSDNode> almost works but is slightly too permissive for some DS
504504
// intrinsics.
505-
if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
506-
Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
507-
Opc == AMDGPUISD::ATOMIC_LOAD_FMAX) {
505+
if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N)) {
508506
N = glueCopyToM0LDSInit(N);
509507
SelectCode(N);
510508
return;

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5524,8 +5524,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
55245524
NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
55255525
NODE_NAME_CASE(DS_ORDERED_COUNT)
55265526
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5527-
NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
5528-
NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
55295527
NODE_NAME_CASE(BUFFER_LOAD)
55305528
NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
55315529
NODE_NAME_CASE(BUFFER_LOAD_USHORT)

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -575,8 +575,6 @@ enum NodeType : unsigned {
575575
TBUFFER_LOAD_FORMAT_D16,
576576
DS_ORDERED_COUNT,
577577
ATOMIC_CMP_SWAP,
578-
ATOMIC_LOAD_FMIN,
579-
ATOMIC_LOAD_FMAX,
580578
BUFFER_LOAD,
581579
BUFFER_LOAD_UBYTE,
582580
BUFFER_LOAD_USHORT,

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3620,8 +3620,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
36203620
case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
36213621
case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
36223622
case TargetOpcode::G_ATOMICRMW_FADD:
3623-
case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
3624-
case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
3623+
case TargetOpcode::G_ATOMICRMW_FMIN:
3624+
case TargetOpcode::G_ATOMICRMW_FMAX:
36253625
return selectG_LOAD_STORE_ATOMICRMW(I);
36263626
case TargetOpcode::G_SELECT:
36273627
return selectG_SELECT(I);

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -682,6 +682,8 @@ defm atomic_load_umax : binary_atomic_op_all_as<atomic_load_umax>;
682682
defm atomic_load_umin : binary_atomic_op_all_as<atomic_load_umin>;
683683
defm atomic_load_xor : binary_atomic_op_all_as<atomic_load_xor>;
684684
defm atomic_load_fadd : binary_atomic_op_fp_all_as<atomic_load_fadd>;
685+
defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>;
686+
defm atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>;
685687
defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>;
686688
defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>;
687689
defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>;

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,9 @@ static const LLT S1 = LLT::scalar(1);
283283
static const LLT S8 = LLT::scalar(8);
284284
static const LLT S16 = LLT::scalar(16);
285285
static const LLT S32 = LLT::scalar(32);
286+
static const LLT F32 = LLT::float32();
286287
static const LLT S64 = LLT::scalar(64);
288+
static const LLT F64 = LLT::float64();
287289
static const LLT S96 = LLT::scalar(96);
288290
static const LLT S128 = LLT::scalar(128);
289291
static const LLT S160 = LLT::scalar(160);
@@ -1648,6 +1650,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
16481650
if (ST.hasFlatAtomicFaddF32Inst())
16491651
Atomic.legalFor({{S32, FlatPtr}});
16501652

1653+
getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1654+
.legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1655+
16511656
if (ST.hasGFX90AInsts()) {
16521657
// These are legal with some caveats, and should have undergone expansion in
16531658
// the IR in most situations
@@ -5401,9 +5406,9 @@ static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
54015406
case Intrinsic::amdgcn_ds_fadd:
54025407
return AMDGPU::G_ATOMICRMW_FADD;
54035408
case Intrinsic::amdgcn_ds_fmin:
5404-
return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
5409+
return AMDGPU::G_ATOMICRMW_FMIN;
54055410
case Intrinsic::amdgcn_ds_fmax:
5406-
return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
5411+
return AMDGPU::G_ATOMICRMW_FMAX;
54075412
default:
54085413
llvm_unreachable("not a DS FP intrinsic");
54095414
}

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5221,11 +5221,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
52215221
case AMDGPU::G_ATOMICRMW_UMAX:
52225222
case AMDGPU::G_ATOMICRMW_UMIN:
52235223
case AMDGPU::G_ATOMICRMW_FADD:
5224+
case AMDGPU::G_ATOMICRMW_FMIN:
5225+
case AMDGPU::G_ATOMICRMW_FMAX:
52245226
case AMDGPU::G_ATOMICRMW_UINC_WRAP:
52255227
case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
5226-
case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
5227-
case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
5228-
case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
5228+
case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
52295229
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
52305230
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
52315231
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 20 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -945,6 +945,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
945945
ISD::ATOMIC_LOAD_UMIN,
946946
ISD::ATOMIC_LOAD_UMAX,
947947
ISD::ATOMIC_LOAD_FADD,
948+
ISD::ATOMIC_LOAD_FMIN,
949+
ISD::ATOMIC_LOAD_FMAX,
948950
ISD::ATOMIC_LOAD_UINC_WRAP,
949951
ISD::ATOMIC_LOAD_UDEC_WRAP,
950952
ISD::INTRINSIC_VOID,
@@ -8707,25 +8709,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
87078709
case Intrinsic::amdgcn_ds_fmin:
87088710
case Intrinsic::amdgcn_ds_fmax: {
87098711
MemSDNode *M = cast<MemSDNode>(Op);
8710-
unsigned Opc;
8711-
switch (IntrID) {
8712-
case Intrinsic::amdgcn_ds_fmin:
8713-
Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
8714-
break;
8715-
case Intrinsic::amdgcn_ds_fmax:
8716-
Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
8717-
break;
8718-
default:
8719-
llvm_unreachable("Unknown intrinsic!");
8720-
}
8721-
SDValue Ops[] = {
8722-
M->getOperand(0), // Chain
8723-
M->getOperand(2), // Ptr
8724-
M->getOperand(3) // Value
8725-
};
8726-
8727-
return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
8728-
M->getMemoryVT(), M->getMemOperand());
8712+
unsigned Opc = IntrID == Intrinsic::amdgcn_ds_fmin ? ISD::ATOMIC_LOAD_FMIN
8713+
: ISD::ATOMIC_LOAD_FMAX;
8714+
return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), M->getOperand(0),
8715+
M->getOperand(2), M->getOperand(3),
8716+
M->getMemOperand());
87298717
}
87308718
case Intrinsic::amdgcn_raw_buffer_load:
87318719
case Intrinsic::amdgcn_raw_ptr_buffer_load:
@@ -9138,22 +9126,21 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
91389126
case Intrinsic::amdgcn_global_atomic_fmin_num:
91399127
case Intrinsic::amdgcn_flat_atomic_fmin:
91409128
case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9141-
Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
9129+
Opcode = ISD::ATOMIC_LOAD_FMIN;
91429130
break;
91439131
}
91449132
case Intrinsic::amdgcn_global_atomic_fmax:
91459133
case Intrinsic::amdgcn_global_atomic_fmax_num:
91469134
case Intrinsic::amdgcn_flat_atomic_fmax:
91479135
case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9148-
Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
9136+
Opcode = ISD::ATOMIC_LOAD_FMAX;
91499137
break;
91509138
}
91519139
default:
91529140
llvm_unreachable("unhandled atomic opcode");
91539141
}
9154-
return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op),
9155-
M->getVTList(), Ops, M->getMemoryVT(),
9156-
M->getMemOperand());
9142+
return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9143+
Ops, M->getMemOperand());
91579144
}
91589145
case Intrinsic::amdgcn_s_get_barrier_state: {
91599146
SDValue Chain = Op->getOperand(0);
@@ -15824,8 +15811,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
1582415811
case ISD::INTRINSIC_W_CHAIN:
1582515812
return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
1582615813
case AMDGPUISD::ATOMIC_CMP_SWAP:
15827-
case AMDGPUISD::ATOMIC_LOAD_FMIN:
15828-
case AMDGPUISD::ATOMIC_LOAD_FMAX:
1582915814
case AMDGPUISD::BUFFER_ATOMIC_SWAP:
1583015815
case AMDGPUISD::BUFFER_ATOMIC_ADD:
1583115816
case AMDGPUISD::BUFFER_ATOMIC_SUB:
@@ -16086,7 +16071,15 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1608616071
return AtomicExpansionKind::CmpXChg;
1608716072
}
1608816073
case AtomicRMWInst::FMin:
16089-
case AtomicRMWInst::FMax:
16074+
case AtomicRMWInst::FMax: {
16075+
Type *Ty = RMW->getType();
16076+
16077+
// LDS float and double fmin/fmax were always supported.
16078+
if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
16079+
return AtomicExpansionKind::None;
16080+
16081+
return AtomicExpansionKind::CmpXChg;
16082+
}
1609016083
case AtomicRMWInst::Min:
1609116084
case AtomicRMWInst::Max:
1609216085
case AtomicRMWInst::UMin:

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -72,14 +72,6 @@ def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
7272
SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
7373
]>;
7474

75-
def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32,
76-
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
77-
>;
78-
79-
def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32,
80-
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
81-
>;
82-
8375
// load_d16_{lo|hi} ptr, tied_input
8476
def SIload_d16 : SDTypeProfile<1, 2, [
8577
SDTCisPtrTy<1>,
@@ -314,13 +306,6 @@ class isIntType<ValueType SrcVT> {
314306
bit ret = !and(SrcVT.isInteger, !ne(SrcVT.Value, i1.Value));
315307
}
316308

317-
//===----------------------------------------------------------------------===//
318-
// PatFrags for global memory operations
319-
//===----------------------------------------------------------------------===//
320-
321-
defm atomic_load_fmin : binary_atomic_op_fp_all_as<SIatomic_fmin>;
322-
defm atomic_load_fmax : binary_atomic_op_fp_all_as<SIatomic_fmax>;
323-
324309
//===----------------------------------------------------------------------===//
325310
// SDNodes PatFrags for loads/stores with a glue input.
326311
// This is for SDNodes and PatFrag for local loads and stores to
@@ -743,8 +728,8 @@ defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
743728
defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
744729
defm atomic_swap : SIAtomicM0Glue2 <"SWAP">;
745730
defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32, 0>;
746-
defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32, 0>;
747-
defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32, 0>;
731+
defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 0, SDTAtomic2_f32, 0>;
732+
defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 0, SDTAtomic2_f32, 0>;
748733

749734
def as_i1timm : SDNodeXForm<timm, [{
750735
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1);

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3863,11 +3863,6 @@ def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
38633863
let mayStore = 1;
38643864
}
38653865

3866-
let Namespace = "AMDGPU" in {
3867-
def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP;
3868-
def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP;
3869-
}
3870-
38713866
class BufferAtomicGenericInstruction : AMDGPUGenericInstruction {
38723867
let OutOperandList = (outs type0:$dst);
38733868
let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -590,9 +590,7 @@ bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) {
590590
}
591591

592592
bool isGenericAtomic(unsigned Opc) {
593-
return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN ||
594-
Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX ||
595-
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP ||
593+
return Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP ||
596594
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD ||
597595
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB ||
598596
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN ||

llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,12 +98,13 @@ body: |
9898
%2:_(s32) = IMPLICIT_DEF
9999
%3:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7
100100
%4:_(s32) = G_CONSTANT i32 0
101+
%ptr_lds:_(p3) = G_IMPLICIT_DEF
101102
102-
; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_AMDGPU_ATOMIC_FMIN
103-
%5:_(s32) = G_AMDGPU_ATOMIC_FMIN %0, %3
103+
; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_FMIN
104+
%5:_(s32) = G_ATOMICRMW_FMIN %ptr_lds, %4
104105
105-
; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_AMDGPU_ATOMIC_FMAX
106-
%6:_(s32) = G_AMDGPU_ATOMIC_FMAX %0, %3
106+
; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_FMAX
107+
%6:_(s32) = G_ATOMICRMW_FMAX %ptr_lds, %4
107108
108109
; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_AMDGPU_BUFFER_ATOMIC_SWAP
109110
%7:_(s32) = G_AMDGPU_BUFFER_ATOMIC_SWAP %0, %3, %4, %4, %4, 0, 0, 0 :: (volatile dereferenceable load store (s32), align 1, addrspace 8)

0 commit comments

Comments
 (0)