Skip to content

[NVPTX] Add syncscope support for cmpxchg #140812

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions llvm/include/llvm/CodeGen/TargetLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -2319,13 +2319,15 @@ class TargetLoweringBase {
/// standard ABI uses a fence before a seq_cst load instead of after a
/// seq_cst store).
/// @{
virtual Instruction *emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const;

virtual Instruction *emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const;
// NOTE(review): SSID defaults to SyncScope::System on this virtual hook (and
// on emitTrailingFence). The ARM, NVPTX, PowerPC and RISC-V overrides each
// repeat the same default; a default argument on a virtual function is taken
// from the static type at the call site, so the repeated values need to stay
// identical.
virtual Instruction *
emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord,
SyncScope::ID SSID = SyncScope::System) const;

virtual Instruction *
emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord,
SyncScope::ID SSID = SyncScope::System) const;
/// @}

// Emits code that executes when the comparison result in the ll/sc
Expand Down
18 changes: 13 additions & 5 deletions llvm/lib/CodeGen/AtomicExpandPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ class AtomicExpandImpl {
const DataLayout *DL = nullptr;

private:
bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
SyncScope::ID SSID = SyncScope::System);
IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
bool tryExpandAtomicLoad(LoadInst *LI);
Expand Down Expand Up @@ -303,6 +304,7 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {

if (TLI->shouldInsertFencesForAtomic(I)) {
auto FenceOrdering = AtomicOrdering::Monotonic;
SyncScope::ID SSID = SyncScope::System;
if (LI && isAcquireOrStronger(LI->getOrdering())) {
FenceOrdering = LI->getOrdering();
LI->setOrdering(AtomicOrdering::Monotonic);
Expand All @@ -325,13 +327,18 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
// expandAtomicCmpXchg in that case.
FenceOrdering = CASI->getMergedOrdering();
auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI);
SSID = CASI->getSyncScopeID();

CASI->setSuccessOrdering(CASOrdering);
CASI->setFailureOrdering(CASOrdering);
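// If CAS ordering is monotonic, then the operation will
// take default scope. Otherwise, it will retain its scope.
// NOTE(review): SSID was read from CASI just above, so the guarded
// assignment below appears to write back the same value, leaving the scope
// unchanged in both cases. If the monotonic case is meant to take the
// default scope, it has to be set explicitly -- confirm the intent.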
if (CASOrdering != AtomicOrdering::Monotonic)
CASI->setSyncScopeID(SSID);
}

if (FenceOrdering != AtomicOrdering::Monotonic) {
MadeChange |= bracketInstWithFences(I, FenceOrdering);
MadeChange |= bracketInstWithFences(I, FenceOrdering, SSID);
}
} else if (I->hasAtomicStore() &&
TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
Expand Down Expand Up @@ -432,12 +439,13 @@ PreservedAnalyses AtomicExpandPass::run(Function &F,
}

bool AtomicExpandImpl::bracketInstWithFences(Instruction *I,
AtomicOrdering Order) {
AtomicOrdering Order,
SyncScope::ID SSID) {
ReplacementIRBuilder Builder(I, *DL);

auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);
auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order, SSID);

auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order, SSID);
// We have a guard here because not every atomic operation generates a
// trailing fence.
if (TrailingFence)
Expand Down
10 changes: 6 additions & 4 deletions llvm/lib/CodeGen/TargetLoweringBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2327,18 +2327,20 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI,

// Default leading-fence hook: when Ord is release-or-stronger and Inst has an
// atomic store, a fence with ordering Ord is created through Builder;
// otherwise nullptr is returned.
// This hunk shows the pre- and post-change lines together: the change adds
// the SSID parameter and passes it to CreateFence.
Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
AtomicOrdering Ord,
SyncScope::ID SSID) const {
if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore())
return Builder.CreateFence(Ord);
return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}

// Default trailing-fence hook: when Ord is acquire-or-stronger, a fence with
// ordering Ord is created through Builder; otherwise nullptr is returned.
// Inst is not inspected here.
// This hunk shows the pre- and post-change lines together: the change adds
// the SSID parameter and passes it to CreateFence.
Instruction *TargetLoweringBase::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
AtomicOrdering Ord,
SyncScope::ID SSID) const {
if (isAcquireOrStronger(Ord))
return Builder.CreateFence(Ord);
return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}
Expand Down
6 changes: 4 additions & 2 deletions llvm/lib/Target/ARM/ARMISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21229,7 +21229,8 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
AtomicOrdering Ord,
SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
Expand All @@ -21254,7 +21255,8 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,

Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
AtomicOrdering Ord,
SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
Expand Down
12 changes: 8 additions & 4 deletions llvm/lib/Target/ARM/ARMISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -666,10 +666,14 @@ class VectorType;
void
emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override;

Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
Instruction *
emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord,
SyncScope::ID SSID = SyncScope::System) const override;
Instruction *
emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord,
SyncScope::ID SSID = SyncScope::System) const override;

unsigned getMaxSupportedInterleaveFactor() const override;

Expand Down
13 changes: 8 additions & 5 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6311,23 +6311,26 @@ AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(

// NVPTX leading-fence hook. Anything that is not a cmpxchg is delegated to
// the TargetLoweringBase implementation. For a cmpxchg with a
// release-or-stronger ordering, a seq_cst fence is emitted when Ord is
// seq_cst and a release fence otherwise; weaker orderings return nullptr.
// The post-change lines pass SSID to both CreateFence calls.
Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
AtomicOrdering Ord,
SyncScope::ID SSID) const {
// NOTE(review): SSID is not forwarded to the base implementation here, so
// the non-cmpxchg path uses the base hook's default argument
// (SyncScope::System in the TargetLowering.h declaration) regardless of the
// SSID passed in -- confirm this is intended.
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);

// Specialize for cmpxchg
// Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
if (isReleaseOrStronger(Ord))
return Ord == AtomicOrdering::SequentiallyConsistent
? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent)
: Builder.CreateFence(AtomicOrdering::Release);
? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent,
SSID)
: Builder.CreateFence(AtomicOrdering::Release, SSID);

return nullptr;
}

Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
AtomicOrdering Ord,
SyncScope::ID SSID) const {
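// Specialize for cmpxchg.
// NOTE(review): the non-cmpxchg path below calls the base hook without SSID,
// so it falls back to that hook's default scope -- confirm this is intended.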
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
Expand All @@ -6340,7 +6343,7 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
if (isAcquireOrStronger(Ord) &&
(Ord != AtomicOrdering::SequentiallyConsistent ||
CASWidth < STI.getMinCmpXchgSizeInBits()))
return Builder.CreateFence(AtomicOrdering::Acquire);
return Builder.CreateFence(AtomicOrdering::Acquire, SSID);

return nullptr;
}
Expand Down
12 changes: 8 additions & 4 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -280,10 +280,14 @@ class NVPTXTargetLowering : public TargetLowering {
AtomicOrdering
atomicOperationOrderAfterFenceSplit(const Instruction *I) const override;

Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
Instruction *
emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord,
SyncScope::ID SSID = SyncScope::System) const override;
Instruction *
emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord,
SyncScope::ID SSID = SyncScope::System) const override;

unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
EVT ToVT) const override;
Expand Down
58 changes: 46 additions & 12 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,27 @@ def AS_match {
}];
}

// Defines four scope-specific PatFrags: NAME#_cta, NAME#_cluster, NAME#_gpu
// and NAME#_sys. Each one wraps the operator looked up by name via
// !cast<SDPatternOperator>(NAME) over ($ptr, $cmp, $val) and only matches when
// the node's sync scope ID maps, through Scopes, to NVPTX::Scope::Block,
// Cluster, Device or System respectively.
// `frag` only supplies the operand list: its dag operator is replaced by
// `ops` with !setdagop. The wrapped operator comes from NAME, so the defm
// name presumably has to be the name of an existing pattern operator.
multiclass nvvm_ternary_atomic_op_scoped<SDPatternOperator frag> {
defvar frag_pat = (frag node:$ptr, node:$cmp, node:$val);
def NAME#_cta: PatFrag<!setdagop(frag_pat, ops),
(!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Block;
}]>;
def NAME#_cluster : PatFrag<!setdagop(frag_pat, ops),
(!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Cluster;
}]>;
def NAME#_gpu: PatFrag<!setdagop(frag_pat, ops),
(!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Device;
}]>;
def NAME#_sys: PatFrag<!setdagop(frag_pat, ops),
(!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::System;
}]>;
}


// A node that will be replaced with the current PTX version.
class PTX {
SDNodeXForm PTXVerXform = SDNodeXForm<imm, [{
Expand Down Expand Up @@ -2111,9 +2132,9 @@ multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
}

// has 3 operands
multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string as_str, string op_str,
SDPatternOperator op, list<Predicate> preds> {
defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string scope_str, string as_str,
string op_str, SDPatternOperator op, list<Predicate> preds> {
defvar asm_str = "atom" # sem_str # scope_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
def rr : NVPTXInst<(outs t.RC:$dst),
(ins ADDR:$addr, t.RC:$b, t.RC:$c),
Expand Down Expand Up @@ -2149,12 +2170,12 @@ multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, lis
defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}

// Instantiates F_ATOMIC_3 once per address space: global (_G), shared (_S),
// shared::cluster (_S_C, additionally predicated on hasClusters) and generic
// (_GEN, empty address-space string). Each instantiation checks `frag` with
// the matching ATOMIC_*_CHK wrapper.
// This hunk shows the pre- and post-change lines together: the change threads
// a new scope_str through to F_ATOMIC_3.
// NOTE(review): scope_str precedes sem_str in this parameter list, whereas
// F_ATOMIC_3 takes sem_str before scope_str -- easy to swap at call sites.
multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string sem_str, string op_str, list<Predicate> preds = []> {
multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string scope_str, string sem_str, string op_str, list<Predicate> preds = []> {
defvar frag_pat = (frag node:$a, node:$b, node:$c);
defm _G : F_ATOMIC_3<t, sem_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
defm _S : F_ATOMIC_3<t, sem_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
defm _S_C : F_ATOMIC_3<t, sem_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
defm _GEN : F_ATOMIC_3<t, sem_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
defm _G : F_ATOMIC_3<t, sem_str, scope_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
defm _S : F_ATOMIC_3<t, sem_str, scope_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
defm _S_C : F_ATOMIC_3<t, sem_str, scope_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
defm _GEN : F_ATOMIC_3<t, sem_str, scope_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}

// atom_add
Expand Down Expand Up @@ -2205,18 +2226,30 @@ foreach t = [I32RT, I64RT] in {
// For each memory order, define the compare-and-swap instructions for the
// current type `t`: scoped variants, an unscoped ordered variant, and an
// "_old" variant with no order or scope in the mnemonic.
// "monotonic" is spelled ".relaxed" in the emitted order string.
foreach order = ["acquire", "release", "acq_rel", "monotonic"] in {
defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order);
defvar atomic_cmp_swap_pat = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order);

// Instantiate the scoped versions (_cta, _cluster, _gpu, _sys) of the atomic
// compare-and-swap pattern.
defm atomic_cmp_swap_i#t.Size#_#order: nvvm_ternary_atomic_op_scoped<atomic_cmp_swap_pat>;

foreach scope = ["cta", "cluster", "gpu", "sys"] in {
defvar atomic_cmp_swap_pat_scoped = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order#_#scope);

// Syncscope is only supported for SM70+ (predicates: hasSM<70>, hasPTX<63>).
defm INT_PTX_ATOM_CAS_#t.Size#_#order#_#scope
: F_ATOMIC_3_AS<t, atomic_cmp_swap_pat_scoped, "."#scope, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
}

// Note that AtomicExpand will convert a seq_cst cmpxchg into a monotonic
// cmpxchg with fences around it.
// Memory orders are only supported for SM70+ and PTX63+, so we have two sets
// of instruction definitions: one for SM70+, and "old" ones, which lower to
// "atom.cas", for earlier archs.
defm INT_PTX_ATOM_CAS_#t.Size#_#order
: F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
: F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>;
defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old
: F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "cas.b"#t.Size, []>;
: F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "", "cas.b"#t.Size, []>;
}
}

// Note that 16-bit CAS support in PTX is emulated.
defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "cas.b16", [hasSM<70>, hasPTX<63>]>;
defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "", "cas.b16", [hasSM<70>, hasPTX<63>]>;

// Support for scoped atomic operations. Matches
// int_nvvm_atomic_{op}_{space}_{type}_{scope}
Expand Down Expand Up @@ -2246,7 +2279,8 @@ multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr,
RegTyInfo t, list<Predicate> Preds> {
defm "" : F_ATOMIC_3<t,
as_str = !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr),
sem_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
sem_str = "",
scope_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
op_str = OpStr # "." # TypeStr,
op = !cast<Intrinsic>(
"int_nvvm_atomic_" # OpStr
Expand Down
6 changes: 4 additions & 2 deletions llvm/lib/Target/PowerPC/PPCISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12588,7 +12588,8 @@ static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
AtomicOrdering Ord,
SyncScope::ID SSID) const {
if (Ord == AtomicOrdering::SequentiallyConsistent)
return callIntrinsic(Builder, Intrinsic::ppc_sync);
if (isReleaseOrStronger(Ord))
Expand All @@ -12598,7 +12599,8 @@ Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,

Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
AtomicOrdering Ord,
SyncScope::ID SSID) const {
if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
// See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
// http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
Expand Down
12 changes: 8 additions & 4 deletions llvm/lib/Target/PowerPC/PPCISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -927,10 +927,14 @@ namespace llvm {
return true;
}

Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
Instruction *
emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord,
SyncScope::ID SSID = SyncScope::System) const override;
Instruction *
emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord,
SyncScope::ID SSID = SyncScope::System) const override;

bool shouldInlineQuadwordAtomics() const;

Expand Down
6 changes: 4 additions & 2 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22800,7 +22800,8 @@ void RISCVTargetLowering::LowerAsmOperandForConstraint(

Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
AtomicOrdering Ord,
SyncScope::ID SSID) const {
if (Subtarget.hasStdExtZtso()) {
if (isa<LoadInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent)
return Builder.CreateFence(Ord);
Expand All @@ -22816,7 +22817,8 @@ Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilderBase &Builder,

Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
AtomicOrdering Ord,
SyncScope::ID SSID) const {
if (Subtarget.hasStdExtZtso()) {
if (isa<StoreInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent)
return Builder.CreateFence(Ord);
Expand Down
12 changes: 8 additions & 4 deletions llvm/lib/Target/RISCV/RISCVISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -225,10 +225,14 @@ class RISCVTargetLowering : public TargetLowering {
// than this hook due to limitations in the interface here.
bool shouldInsertFencesForAtomic(const Instruction *I) const override;

Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;
Instruction *
emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord,
SyncScope::ID SSID = SyncScope::System) const override;
Instruction *
emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord,
SyncScope::ID SSID = SyncScope::System) const override;

bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const override;
Expand Down
Loading
Loading