21 changes: 21 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
@@ -537,6 +537,8 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following
- Packed
work-item Add product
IDs names.
- Workgroup
Clusters

=========== =============== ============ ===== ================= =============== =============== ======================

@@ -1095,6 +1097,22 @@ is conservatively correct for OpenCL.
- ``wavefront`` and executed by a thread in the
same wavefront.

``cluster`` Synchronizes with, and participates in modification
and seq_cst total orderings with, other operations
(except image operations) for all address spaces
(except private, or generic that accesses private)
provided the other operation's sync scope is:

- ``system``, ``agent`` or ``cluster`` and
executed by a thread on the same cluster.
- ``workgroup`` and executed by a thread in the
same work-group.
- ``wavefront`` and executed by a thread in the
same wavefront.

On targets that do not support workgroup cluster
launch mode, this behaves like ``agent`` scope instead.

``workgroup`` Synchronizes with, and participates in modification
and seq_cst total orderings with, other operations
(except image operations) for all address spaces
@@ -1128,6 +1146,9 @@ is conservatively correct for OpenCL.
``agent-one-as`` Same as ``agent`` but only synchronizes with other
operations within the same address space.

``cluster-one-as`` Same as ``cluster`` but only synchronizes with other
operations within the same address space.

``workgroup-one-as`` Same as ``workgroup`` but only synchronizes with
other operations within the same address space.

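The two new names documented above are ordinary target-specific sync scope strings, so a frontend can request them through the stock LLVM C++ API; no new builder hooks are involved. A minimal sketch, not part of the patch (the module, function and argument names are placeholders):

  // Emits an atomicrmw and a fence at the new AMDGPU "cluster" scope.
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    Module M("cluster-scope-demo", Ctx);

    // Target-specific sync scope names are interned on the LLVMContext.
    SyncScope::ID ClusterSSID = Ctx.getOrInsertSyncScopeID("cluster");

    Type *PtrTy = PointerType::get(Ctx, /*AddressSpace=*/1); // global
    FunctionType *FTy =
        FunctionType::get(Type::getVoidTy(Ctx), {PtrTy}, /*isVarArg=*/false);
    Function *F = Function::Create(FTy, Function::ExternalLinkage, "demo", M);
    IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));

    // atomicrmw add ptr addrspace(1) %p, i32 1 syncscope("cluster") seq_cst
    B.CreateAtomicRMW(AtomicRMWInst::Add, F->getArg(0), B.getInt32(1),
                      MaybeAlign(4), AtomicOrdering::SequentiallyConsistent,
                      ClusterSSID);

    // fence syncscope("cluster") acquire
    B.CreateFence(AtomicOrdering::Acquire, ClusterSSID);

    B.CreateRetVoid();
    M.print(outs(), nullptr);
    return 0;
  }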
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp
@@ -23,6 +23,7 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI)
AgentSSID = CTX.getOrInsertSyncScopeID("agent");
WorkgroupSSID = CTX.getOrInsertSyncScopeID("workgroup");
WavefrontSSID = CTX.getOrInsertSyncScopeID("wavefront");
ClusterSSID = CTX.getOrInsertSyncScopeID("cluster");
SystemOneAddressSpaceSSID =
CTX.getOrInsertSyncScopeID("one-as");
AgentOneAddressSpaceSSID =
@@ -33,4 +34,5 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI)
CTX.getOrInsertSyncScopeID("wavefront-one-as");
SingleThreadOneAddressSpaceSSID =
CTX.getOrInsertSyncScopeID("singlethread-one-as");
ClusterOneAddressSpaceSSID = CTX.getOrInsertSyncScopeID("cluster-one-as");
}
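These lookups rely on sync scope names being interned per LLVMContext: asking for the same name again yields the same ID, so the IDs cached here can later be compared directly against the sync scope ID carried by an instruction. A small self-contained illustration, not part of the patch:

  #include "llvm/IR/LLVMContext.h"
  #include <cassert>

  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    SyncScope::ID A = Ctx.getOrInsertSyncScopeID("cluster");
    SyncScope::ID B = Ctx.getOrInsertSyncScopeID("cluster");
    assert(A == B && "same name, same interned ID");

    // The "-one-as" variant is a distinct scope with its own ID.
    SyncScope::ID C = Ctx.getOrInsertSyncScopeID("cluster-one-as");
    assert(C != A);
    return 0;
  }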
20 changes: 17 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h
@@ -32,6 +32,8 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
SyncScope::ID WorkgroupSSID;
/// Wavefront synchronization scope ID (cross address space).
SyncScope::ID WavefrontSSID;
/// Cluster synchronization scope ID (cross address space).
SyncScope::ID ClusterSSID;
/// System synchronization scope ID (single address space).
SyncScope::ID SystemOneAddressSpaceSSID;
/// Agent synchronization scope ID (single address space).
@@ -42,6 +44,8 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
SyncScope::ID WavefrontOneAddressSpaceSSID;
/// Single thread synchronization scope ID (single address space).
SyncScope::ID SingleThreadOneAddressSpaceSSID;
/// Cluster synchronization scope ID (single address space).
SyncScope::ID ClusterOneAddressSpaceSSID;

/// In AMDGPU target synchronization scopes are inclusive, meaning a
/// larger synchronization scope is inclusive of a smaller synchronization
@@ -60,20 +64,24 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
else if (SSID == getWorkgroupSSID() ||
SSID == getWorkgroupOneAddressSpaceSSID())
return 2;
else if (SSID == getClusterSSID() ||
SSID == getClusterOneAddressSpaceSSID())
return 3;
else if (SSID == getAgentSSID() ||
SSID == getAgentOneAddressSpaceSSID())
- return 3;
+ return 4;
else if (SSID == SyncScope::System ||
SSID == getSystemOneAddressSpaceSSID())
- return 4;
+ return 5;

return std::nullopt;
}

/// \returns True if \p SSID is restricted to single address space, false
/// otherwise
bool isOneAddressSpace(SyncScope::ID SSID) const {
- return SSID == getSingleThreadOneAddressSpaceSSID() ||
+ return SSID == getClusterOneAddressSpaceSSID() ||
+        SSID == getSingleThreadOneAddressSpaceSSID() ||
SSID == getWavefrontOneAddressSpaceSSID() ||
SSID == getWorkgroupOneAddressSpaceSSID() ||
SSID == getAgentOneAddressSpaceSSID() ||
@@ -95,6 +103,8 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
SyncScope::ID getWavefrontSSID() const {
return WavefrontSSID;
}
/// \returns Cluster synchronization scope ID (cross address space).
SyncScope::ID getClusterSSID() const { return ClusterSSID; }
/// \returns System synchronization scope ID (single address space).
SyncScope::ID getSystemOneAddressSpaceSSID() const {
return SystemOneAddressSpaceSSID;
@@ -115,6 +125,10 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
SyncScope::ID getSingleThreadOneAddressSpaceSSID() const {
return SingleThreadOneAddressSpaceSSID;
}
/// \returns Cluster synchronization scope ID (single address space).
SyncScope::ID getClusterOneAddressSpaceSSID() const {
return ClusterOneAddressSpaceSSID;
}

/// In AMDGPU target synchronization scopes are inclusive, meaning a
/// larger synchronization scope is inclusive of a smaller synchronization
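To make the renumbering in the ranking helper above easier to follow: cluster slots in between workgroup and agent, so the agent and system ranks each shift up by one. A plain-C++ sketch of the resulting order, not the LLVM helper itself (the two lowest ranks are not visible in the hunk and are inferred from the same pattern):

  #include <cassert>
  #include <optional>
  #include <string_view>

  // Inclusion ranks after this patch; a scope of lower rank is contained
  // in (and therefore implied by) any scope of higher rank.
  static std::optional<int> scopeRank(std::string_view Scope) {
    if (Scope == "singlethread") return 0;
    if (Scope == "wavefront")    return 1;
    if (Scope == "workgroup")    return 2;
    if (Scope == "cluster")      return 3; // new
    if (Scope == "agent")        return 4; // previously 3
    if (Scope == "system")       return 5; // previously 4
    return std::nullopt;                   // unknown scope name
  }

  int main() {
    // Workgroup-scope ordering is implied by cluster-scope ordering,
    // which in turn is implied by agent-scope ordering.
    assert(*scopeRank("workgroup") < *scopeRank("cluster"));
    assert(*scopeRank("cluster") < *scopeRank("agent"));
    return 0;
  }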
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1833,6 +1833,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return GFX1250Insts && getGeneration() == GFX12;
}

/// \returns true if the subtarget supports clusters of workgroups.
bool hasClusters() const { return GFX1250Insts; }

/// \returns true if the subtarget requires a wait for xcnt before atomic
/// flat/global stores & rmw.
bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; }
51 changes: 36 additions & 15 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -63,6 +63,7 @@ enum class SIAtomicScope {
SINGLETHREAD,
WAVEFRONT,
WORKGROUP,
CLUSTER, // Promoted to AGENT on targets without workgroup clusters.
AGENT,
SYSTEM
};
@@ -106,6 +107,7 @@ class SIMemOpInfo final {
bool IsCooperative = false;

SIMemOpInfo(
const GCNSubtarget &ST,
AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
SIAtomicScope Scope = SIAtomicScope::SYSTEM,
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
@@ -156,6 +158,11 @@
SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
this->Scope = std::min(Scope, SIAtomicScope::AGENT);
}

// On targets that have no concept of a workgroup cluster, use
// AGENT scope as a conservatively correct alternative.
if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
this->Scope = SIAtomicScope::AGENT;
}

public:
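The constructor hunk above adjusts the requested scope in two conservative directions: it narrows the scope to AGENT when the instruction's address spaces cannot be observed beyond the agent, and it widens the new CLUSTER scope to AGENT on subtargets without workgroup clusters. A stand-in sketch of that logic (my own enum and predicate names, not the real SIAtomicAddrSpace test):

  #include <algorithm>
  #include <cassert>

  // Stand-in for SIAtomicScope; enumerator order matters, narrowest first.
  enum class Scope { SingleThread, Wavefront, Workgroup, Cluster, Agent, System };

  Scope adjustScope(Scope S, bool MayOrderBeyondAgent, bool HasClusters) {
    if (!MayOrderBeyondAgent)
      S = std::min(S, Scope::Agent); // enum order makes std::min a narrowing
    if (S == Scope::Cluster && !HasClusters)
      S = Scope::Agent;              // widen: synchronizes more, never less
    return S;
  }

  int main() {
    assert(adjustScope(Scope::Cluster, true, /*HasClusters=*/false) == Scope::Agent);
    assert(adjustScope(Scope::Cluster, true, /*HasClusters=*/true) == Scope::Cluster);
    return 0;
  }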
@@ -225,6 +232,7 @@ class SIMemOpInfo final {
class SIMemOpAccess final {
private:
const AMDGPUMachineModuleInfo *MMI = nullptr;
const GCNSubtarget &ST;

/// Reports unsupported message \p Msg for \p MI to LLVM context.
void reportUnsupported(const MachineBasicBlock::iterator &MI,
@@ -248,7 +256,7 @@ class SIMemOpAccess final {
public:
/// Construct class to support accessing the machine memory operands
/// of instructions in the machine function \p MF.
- SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI);
+ SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST);

/// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
std::optional<SIMemOpInfo>
@@ -773,6 +781,8 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
if (SSID == MMI->getAgentSSID())
return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
if (SSID == MMI->getClusterSSID())
return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true);
if (SSID == MMI->getWorkgroupSSID())
return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
true);
@@ -788,6 +798,9 @@
if (SSID == MMI->getAgentOneAddressSpaceSSID())
return std::tuple(SIAtomicScope::AGENT,
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
if (SSID == MMI->getClusterOneAddressSpaceSSID())
return std::tuple(SIAtomicScope::CLUSTER,
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
return std::tuple(SIAtomicScope::WORKGROUP,
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
@@ -815,8 +828,9 @@ SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
return SIAtomicAddrSpace::OTHER;
}

- SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_)
-     : MMI(&MMI_) {}
+ SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
+                              const GCNSubtarget &ST)
+     : MMI(&MMI_), ST(ST) {}

std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
const MachineBasicBlock::iterator &MI) const {
@@ -877,7 +891,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
return std::nullopt;
}
}
- return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
+ return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
IsNonTemporal, IsLastUse, IsCooperative);
}
@@ -891,7 +905,7 @@ SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {

// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo();
+ return SIMemOpInfo(ST);

return constructFromMIWithMMO(MI);
}
@@ -905,7 +919,7 @@ SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {

// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo();
+ return SIMemOpInfo(ST);

return constructFromMIWithMMO(MI);
}
@@ -946,8 +960,9 @@ SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
if (SynchronizeAS)
OrderingAddrSpace = *SynchronizeAS;

- return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
-                    IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
+ return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
+                    SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
+                    AtomicOrdering::NotAtomic);
}

std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
Expand All @@ -959,7 +974,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(

// Be conservative if there are no memory operands.
if (MI->getNumMemOperands() == 0)
- return SIMemOpInfo();
+ return SIMemOpInfo(ST);

return constructFromMIWithMMO(MI);
}
@@ -2377,6 +2392,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
case SIAtomicScope::CLUSTER:
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
LOADCnt |= true;
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
@@ -2413,6 +2429,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
switch (Scope) {
case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:
case SIAtomicScope::CLUSTER:
case SIAtomicScope::WORKGROUP:
// If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
// not needed as LDS operations for all waves are executed in a total
@@ -2495,6 +2512,9 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
case SIAtomicScope::AGENT:
ScopeImm = AMDGPU::CPol::SCOPE_DEV;
break;
case SIAtomicScope::CLUSTER:
ScopeImm = AMDGPU::CPol::SCOPE_SE;
break;
case SIAtomicScope::WORKGROUP:
// GFX12.0:
// In WGP mode the waves of a work-group can be executing on either CU of
@@ -2565,6 +2585,7 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
.addImm(AMDGPU::CPol::SCOPE_DEV);
}
break;
case SIAtomicScope::CLUSTER:
case SIAtomicScope::WORKGROUP:
// No WB necessary, but we still have to wait.
break;
@@ -2649,11 +2670,8 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
const unsigned Scope = CPol->getImm() & CPol::SCOPE;

// GFX12.0 only: Extra waits needed before system scope stores.
- if (!ST.hasGFX1250Insts()) {
-   if (!Atomic && Scope == CPol::SCOPE_SYS)
-     return insertWaitsBeforeSystemScopeStore(MI);
-   return Changed;
- }
+ if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS)
+   Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());

return Changed;
}
@@ -2684,6 +2702,9 @@ bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
case SIAtomicScope::AGENT:
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
break;
case SIAtomicScope::CLUSTER:
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
break;
case SIAtomicScope::WORKGROUP:
// In workgroup mode, SCOPE_SE is needed as waves can execute on
// different CUs that access different L0s.
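Taken together with the insertAcquire hunk earlier, the cache-control changes encode the new scope without introducing a new ISA scope value: cluster-scoped operations use the SE (shader engine) cache-policy scope, one level below DEV, as the two switch cases above show. A stand-in sketch of the mapping (my own enum names, not the real AMDGPU::CPol constants; the workgroup CU/WGP distinction is simplified to a flag):

  // Mirrors the SIAtomicScope -> cache-policy scope mapping in the hunks above.
  enum class MemScope { Workgroup, Cluster, Agent, System };
  enum class CPolScope { CU, SE, Dev, Sys };

  CPolScope toCPolScope(MemScope S, bool WavesMaySpanCUs) {
    switch (S) {
    case MemScope::System:    return CPolScope::Sys;
    case MemScope::Agent:     return CPolScope::Dev;
    case MemScope::Cluster:   return CPolScope::SE;  // new in this patch
    case MemScope::Workgroup: // SE when a work-group's waves may run on
                              // different CUs with separate L0 caches
      return WavesMaySpanCUs ? CPolScope::SE : CPolScope::CU;
    }
    return CPolScope::Sys; // unreachable
  }

  int main() { return toCPolScope(MemScope::Cluster, true) == CPolScope::SE ? 0 : 1; }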
@@ -2930,8 +2951,8 @@ SIMemoryLegalizerPass::run(MachineFunction &MF,
bool SIMemoryLegalizer::run(MachineFunction &MF) {
bool Changed = false;

- SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>());
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST);
CC = SICacheControl::create(ST);

for (auto &MBB : MF) {