Skip to content

[libomptarget][nextgen-plugin] Use SCRELEASE/SCACQUIRE in packet header #85952

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion libc/utils/gpu/loader/amdgpu/Loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,8 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
uint32_t header_word = header | (setup << 16u);
__atomic_store_n((uint32_t *)&packet->header, header_word, __ATOMIC_RELEASE);
__atomic_store_n(reinterpret_cast<uint32_t *>(packet), header_word,
__ATOMIC_RELEASE);
hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);

// Wait until the kernel has completed execution on the device. Periodically
Expand Down
4 changes: 2 additions & 2 deletions openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,8 @@ typedef enum { HSA_FENCE_SCOPE_SYSTEM = 2 } hsa_fence_scope_t;

typedef enum {
HSA_PACKET_HEADER_TYPE = 0,
HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE = 9,
HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE = 11
HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE = 9,
HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE = 11
} hsa_packet_header_t;

typedef enum {
Expand Down
23 changes: 12 additions & 11 deletions openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -826,15 +826,15 @@ struct AMDGPUQueueTy {
/// Assumes the queue lock is acquired.
void publishKernelPacket(uint64_t PacketId, uint16_t Setup,
hsa_kernel_dispatch_packet_t *Packet) {
uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);

uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
uint16_t Header =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);

// Publish the packet. Do not modify the packet after this point.
uint32_t HeaderWord = Header | (Setup << 16u);
__atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
__atomic_store_n(reinterpret_cast<uint32_t *>(Packet), HeaderWord,
__ATOMIC_RELEASE);

// Signal the doorbell about the published packet.
hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
Expand All @@ -845,15 +845,16 @@ struct AMDGPUQueueTy {
/// barrier dependencies (signals) are satisfied. Assumes the queue is locked
void publishBarrierPacket(uint64_t PacketId,
hsa_barrier_and_packet_t *Packet) {
uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
uint16_t Setup = 0;
uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
uint16_t Header =
(HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);

// Publish the packet. Do not modify the packet after this point.
uint32_t HeaderWord = Header | (Setup << 16u);
__atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
__atomic_store_n(reinterpret_cast<uint32_t *>(Packet), HeaderWord,
__ATOMIC_RELEASE);

// Signal the doorbell about the published packet.
hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
Expand Down