[AMDGPU][SDAG] Only fold flat offsets if they are inbounds #132353

Open

wants to merge 1 commit into base: users/ritter-x2a/03-19-_amdgpu_nfc_mark_geps_in_flat_offset_folding_tests_as_inbounds
12 changes: 8 additions & 4 deletions llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1069,23 +1069,27 @@ class SelectionDAG {
                      SDValue EVL);

   /// Returns sum of the base pointer and offset.
-  /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap by default.
+  /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap and InBounds by
+  /// default.
   SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL,
                                const SDNodeFlags Flags = SDNodeFlags());
   SDValue getMemBasePlusOffset(SDValue Base, SDValue Offset, const SDLoc &DL,
                                const SDNodeFlags Flags = SDNodeFlags());

   /// Create an add instruction with appropriate flags when used for
   /// addressing some offset of an object. i.e. if a load is split into multiple
-  /// components, create an add nuw from the base pointer to the offset.
+  /// components, create an add nuw inbounds from the base pointer to the
+  /// offset.
   SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset) {
-    return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap);
+    return getMemBasePlusOffset(
+        Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds);
   }

   SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, SDValue Offset) {
     // The object itself can't wrap around the address space, so it shouldn't be
     // possible for the adds of the offsets to the split parts to overflow.
-    return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap);
+    return getMemBasePlusOffset(
+        Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds);
   }

   /// Return a new CALLSEQ_START node, that starts new call frame, in which
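Note, as an illustration of the new contract (not code from this patch): a client that splits a wide load in half would form the second address with getObjectPtrOffset and thereby get an ISD::ADD tagged nuw | inbounds, which the AMDGPU change below can then fold. The helper splitLoadInHalf and its simplifications are hypothetical.

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/Alignment.h"
#include <utility>

using namespace llvm;

// Hypothetical helper, not an LLVM API: split one load into two half loads.
static std::pair<SDValue, SDValue> splitLoadInHalf(SelectionDAG &DAG,
                                                   LoadSDNode *LD, EVT HalfVT) {
  SDLoc DL(LD);
  SDValue Base = LD->getBasePtr();
  uint64_t HalfBytes = HalfVT.getStoreSize().getFixedValue();

  SDValue Lo = DAG.getLoad(HalfVT, DL, LD->getChain(), Base,
                           LD->getPointerInfo(), LD->getAlign());
  // The high half still addresses the same object, so getObjectPtrOffset may
  // tag the pointer add with nuw and, after this patch, inbounds as well.
  SDValue HiPtr =
      DAG.getObjectPtrOffset(DL, Base, TypeSize::getFixed(HalfBytes));
  SDValue Hi = DAG.getLoad(HalfVT, DL, LD->getChain(), HiPtr,
                           LD->getPointerInfo().getWithOffset(HalfBytes),
                           commonAlignment(LD->getAlign(), HalfBytes));
  return {Lo, Hi};
}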
9 changes: 6 additions & 3 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1205,9 +1205,12 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,

   if (DAG.isConstantIntBuildVectorOrConstantInt(N01)) {
     SDNodeFlags NewFlags;
-    if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() &&
-        Flags.hasNoUnsignedWrap())
-      NewFlags |= SDNodeFlags::NoUnsignedWrap;
+    if (N0.getOpcode() == ISD::ADD) {
+      if (N0->getFlags().hasNoUnsignedWrap() && Flags.hasNoUnsignedWrap())
+        NewFlags |= SDNodeFlags::NoUnsignedWrap;
+      if (N0->getFlags().hasInBounds() && Flags.hasInBounds())
+        NewFlags |= SDNodeFlags::InBounds;
+    }

     if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
       // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
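Why a flag may only survive reassociation when both adds carry it: a standalone model in plain C++ (not DAG code; the i8 values are an assumed example). The same intersection rule is applied to inbounds in the hunk above.

#include <cassert>
#include <cstdint>

int main() {
  // Model the reassociation (x + c1) + c2 -> x + (c1 + c2) on i8 values.
  uint8_t X = 100, C1 = 100, C2 = 100;
  // Inner add: 100 + 100 = 200, no unsigned wrap on i8, so it may carry nuw.
  assert(uint8_t(X + C1) == 200);
  // Outer add wraps (200 + 100 = 300 -> 44 mod 256), and the reassociated
  // form x + (c1 + c2) wraps identically, so neither add may claim nuw here:
  // the flag transfers only when *both* original adds are known not to wrap.
  assert(uint8_t(uint8_t(X + C1) + C2) == 44);
  return 0;
}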
12 changes: 6 additions & 6 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8178,7 +8178,7 @@ static SDValue getMemcpyLoadsAndStores(
       if (Value.getNode()) {
         Store = DAG.getStore(
             Chain, dl, Value,
-            DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+            DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
             DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags, NewAAInfo);
         OutChains.push_back(Store);
       }
@@ -8203,14 +8203,14 @@ static SDValue getMemcpyLoadsAndStores(

       Value = DAG.getExtLoad(
           ISD::EXTLOAD, dl, NVT, Chain,
-          DAG.getMemBasePlusOffset(Src, TypeSize::getFixed(SrcOff), dl),
+          DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SrcOff)),
           SrcPtrInfo.getWithOffset(SrcOff), VT,
           commonAlignment(*SrcAlign, SrcOff), SrcMMOFlags, NewAAInfo);
       OutLoadChains.push_back(Value.getValue(1));

       Store = DAG.getTruncStore(
           Chain, dl, Value,
-          DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+          DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
           DstPtrInfo.getWithOffset(DstOff), VT, Alignment, MMOFlags, NewAAInfo);
       OutStoreChains.push_back(Store);
     }
@@ -8347,7 +8347,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,

     Value = DAG.getLoad(
         VT, dl, Chain,
-        DAG.getMemBasePlusOffset(Src, TypeSize::getFixed(SrcOff), dl),
+        DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SrcOff)),
         SrcPtrInfo.getWithOffset(SrcOff), *SrcAlign, SrcMMOFlags, NewAAInfo);
     LoadValues.push_back(Value);
     LoadChains.push_back(Value.getValue(1));
@@ -8362,7 +8362,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,

     Store = DAG.getStore(
         Chain, dl, LoadValues[i],
-        DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+        DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
         DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags, NewAAInfo);
     OutChains.push_back(Store);
     DstOff += VTSize;
@@ -8494,7 +8494,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
     assert(Value.getValueType() == VT && "Value with wrong type.");
     SDValue Store = DAG.getStore(
         Chain, dl, Value,
-        DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+        DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
        DstPtrInfo.getWithOffset(DstOff), Alignment,
         isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone,
         NewAAInfo);
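All six call sites switch to getObjectPtrOffset for the same reason: memcpy/memmove/memset expansion walks the accessed object in legal-sized chunks, so every generated offset stays inside that object and the pointer adds qualify as nuw + inbounds. A standalone model of the chunking (the sizes are assumed examples, not the real MemOps computation):

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t Size = 22;                    // bytes to set/copy (example)
  const uint64_t ChunkBytes[] = {16, 4, 2, 1}; // assumed legal store widths
  uint64_t DstOff = 0;
  for (uint64_t C : ChunkBytes)
    while (Size - DstOff >= C) {
      // Dst + DstOff stays within the destination object for every chunk.
      std::printf("store %llu bytes at Dst + %llu\n",
                  (unsigned long long)C, (unsigned long long)DstOff);
      DstOff += C; // never exceeds Size
    }
  return 0; // stores at offsets 0, 16, 20 cover all 22 bytes, never beyond
}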
140 changes: 75 additions & 65 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1744,72 +1744,82 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
        isFlatScratchBaseLegal(Addr))) {
     int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

-    const SIInstrInfo *TII = Subtarget->getInstrInfo();
-    if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
-      Addr = N0;
-      OffsetVal = COffsetVal;
-    } else {
-      // If the offset doesn't fit, put the low bits into the offset field and
-      // add the rest.
-      //
-      // For a FLAT instruction the hardware decides whether to access
-      // global/scratch/shared memory based on the high bits of vaddr,
-      // ignoring the offset field, so we have to ensure that when we add
-      // remainder to vaddr it still points into the same underlying object.
-      // The easiest way to do that is to make sure that we split the offset
-      // into two pieces that are both >= 0 or both <= 0.
-
-      SDLoc DL(N);
-      uint64_t RemainderOffset;
-
-      std::tie(OffsetVal, RemainderOffset) =
-          TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
-
-      SDValue AddOffsetLo =
-          getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
-      SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
-
-      if (Addr.getValueType().getSizeInBits() == 32) {
-        SmallVector<SDValue, 3> Opnds;
-        Opnds.push_back(N0);
-        Opnds.push_back(AddOffsetLo);
-        unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
-        if (Subtarget->hasAddNoCarry()) {
-          AddOp = AMDGPU::V_ADD_U32_e64;
-          Opnds.push_back(Clamp);
-        }
-        Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+    // Adding the offset to the base address in a FLAT instruction must not
+    // change the memory aperture in which the address falls. Therefore we can
+    // only fold offsets from inbounds GEPs into FLAT instructions.
+    bool IsInBounds = Addr->getFlags().hasInBounds();
+    if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) {
+      const SIInstrInfo *TII = Subtarget->getInstrInfo();
+      if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
+        Addr = N0;
+        OffsetVal = COffsetVal;
       } else {
-      // TODO: Should this try to use a scalar add pseudo if the base address
-      // is uniform and saddr is usable?
-      SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
-      SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
-
-      SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
-                                            DL, MVT::i32, N0, Sub0);
-      SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
-                                            DL, MVT::i32, N0, Sub1);
-
-      SDValue AddOffsetHi =
-          getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
-
-      SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
-
-      SDNode *Add =
-          CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
-                                 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
-
-      SDNode *Addc = CurDAG->getMachineNode(
-          AMDGPU::V_ADDC_U32_e64, DL, VTs,
-          {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
-
-      SDValue RegSequenceArgs[] = {
-          CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
-          SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
-
-      Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
-                                            MVT::i64, RegSequenceArgs),
-                     0);
+        // If the offset doesn't fit, put the low bits into the offset field
+        // and add the rest.
+        //
+        // For a FLAT instruction the hardware decides whether to access
+        // global/scratch/shared memory based on the high bits of vaddr,
+        // ignoring the offset field, so we have to ensure that when we add
+        // remainder to vaddr it still points into the same underlying object.
+        // The easiest way to do that is to make sure that we split the offset
+        // into two pieces that are both >= 0 or both <= 0.
+
+        SDLoc DL(N);
+        uint64_t RemainderOffset;
+
+        std::tie(OffsetVal, RemainderOffset) =
+            TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
+
+        SDValue AddOffsetLo =
+            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+
+        if (Addr.getValueType().getSizeInBits() == 32) {
+          SmallVector<SDValue, 3> Opnds;
+          Opnds.push_back(N0);
+          Opnds.push_back(AddOffsetLo);
+          unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
+          if (Subtarget->hasAddNoCarry()) {
+            AddOp = AMDGPU::V_ADD_U32_e64;
+            Opnds.push_back(Clamp);
+          }
+          Addr =
+              SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+        } else {
+          // TODO: Should this try to use a scalar add pseudo if the base
+          // address is uniform and saddr is usable?
+          SDValue Sub0 =
+              CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+          SDValue Sub1 =
+              CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+
+          SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                                DL, MVT::i32, N0, Sub0);
+          SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                                DL, MVT::i32, N0, Sub1);
+
+          SDValue AddOffsetHi =
+              getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+
+          SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+
+          SDNode *Add =
+              CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
+                                     {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+
+          SDNode *Addc = CurDAG->getMachineNode(
+              AMDGPU::V_ADDC_U32_e64, DL, VTs,
+              {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+
+          SDValue RegSequenceArgs[] = {
+              CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL,
+                                        MVT::i32),
+              SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
+
+          Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+                                                MVT::i64, RegSequenceArgs),
+                         0);
+        }
+        }
       }
     }
   }
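To make the offset-splitting rule in the comment above concrete: a standalone model of the same-sign split (this is not SIInstrInfo::splitFlatOffset itself, and the 13-bit signed offset field is an assumed example width). Truncating division toward zero leaves the immediate and the remainder on the same side of zero, so adding the remainder to an inbounds base address cannot step past either end of the underlying object, and hence cannot change the aperture.

#include <cassert>
#include <cstdint>
#include <utility>

// Model only: split COffset into (Imm, Remainder) with Imm fitting a signed
// 13-bit field and both pieces sharing COffset's sign.
static std::pair<int64_t, int64_t> splitOffsetModel(int64_t COffset) {
  const int64_t D = 1LL << 12;           // 2^12 = field magnitude (assumed)
  int64_t Remainder = (COffset / D) * D; // C++ '/' truncates toward zero
  int64_t Imm = COffset - Remainder;     // same sign as COffset, |Imm| < 4096
  return {Imm, Remainder};
}

int main() {
  auto [Imm, Rem] = splitOffsetModel(10000);  // 10000 = 8192 + 1808
  assert(Imm == 1808 && Rem == 8192);         // both pieces >= 0
  auto [NImm, NRem] = splitOffsetModel(-10000);
  assert(NImm == -1808 && NRem == -8192);     // both pieces <= 0
  return 0;
}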