Skip to content

Commit 8341a40

Browse files
authored
[libc] Update the AMDGPU implementation to use code object 5 (#72580)
Summary: This patch includes the necessary changes to make the `libc` tests running on AMD GPUs run using the newer code object version. The 'code object version' is AMD's internal ABI for making kernel calls. The move from 4 to 5 changed how we handle arguments for builtins such as obtaining the grid size or setting up the size of the private stack. Fixes: #72517
1 parent 89efffd commit 8341a40

File tree

2 files changed

+32
-8
lines changed

2 files changed

+32
-8
lines changed

libc/cmake/modules/prepare_libc_gpu_build.cmake

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,5 @@ if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
120120
# The AMDGPU environment uses different code objects to encode the ABI for
121121
# kernel calls and intrinsic functions. We want to specify this manually to
122122
# conform to whatever the test suite was built to handle.
123-
# FIXME: The test suite currently hangs when compiled targeting version five.
124-
# This occurrs during traversal of the callback array in the startup code. We
125-
# deliberately use version four until this can be addressed.
126-
set(LIBC_GPU_CODE_OBJECT_VERSION 4)
123+
set(LIBC_GPU_CODE_OBJECT_VERSION 5)
127124
endif()

libc/utils/gpu/loader/amdgpu/Loader.cpp

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,19 @@
3434
#include <tuple>
3535
#include <utility>
3636

37+
// The implicit arguments of COV5 AMDGPU kernels.
38+
struct implicit_args_t {
39+
uint32_t grid_size_x;
40+
uint32_t grid_size_y;
41+
uint32_t grid_size_z;
42+
uint16_t workgroup_size_x;
43+
uint16_t workgroup_size_y;
44+
uint16_t workgroup_size_z;
45+
uint8_t Unused0[46];
46+
uint16_t grid_dims;
47+
uint8_t Unused1[190];
48+
};
49+
3750
/// Print the error code and exit if \p code indicates an error.
3851
static void handle_error(hsa_status_t code) {
3952
if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
@@ -185,11 +198,13 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
185198
uint32_t args_size;
186199
uint32_t group_size;
187200
uint32_t private_size;
201+
bool dynamic_stack;
188202

189203
std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
190204
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
191205
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
192206
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
207+
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
193208
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};
194209

195210
for (auto &[info, value] : symbol_infos)
@@ -209,6 +224,19 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
209224
std::memset(args, 0, args_size);
210225
std::memcpy(args, &kernel_args, sizeof(args_t));
211226

227+
// Initialize the necessary implicit arguments to the proper values.
228+
bool dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
229+
(params.num_blocks_z * params.num_threads_z != 1);
230+
implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
231+
reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
232+
implicit_args->grid_dims = dims;
233+
implicit_args->grid_size_x = params.num_threads_x;
234+
implicit_args->grid_size_y = params.num_threads_y;
235+
implicit_args->grid_size_z = params.num_threads_z;
236+
implicit_args->workgroup_size_x = params.num_blocks_x;
237+
implicit_args->workgroup_size_y = params.num_blocks_y;
238+
implicit_args->workgroup_size_z = params.num_blocks_z;
239+
212240
// Obtain a packet from the queue.
213241
uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
214242
while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
@@ -222,17 +250,16 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
222250
// Set up the packet for exeuction on the device. We currently only launch
223251
// with one thread on the device, forcing the rest of the wavefront to be
224252
// masked off.
225-
uint16_t setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
226-
(params.num_blocks_z * params.num_threads_z != 1))
227-
<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
253+
uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
228254
packet->workgroup_size_x = params.num_threads_x;
229255
packet->workgroup_size_y = params.num_threads_y;
230256
packet->workgroup_size_z = params.num_threads_z;
231257
packet->reserved0 = 0;
232258
packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
233259
packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
234260
packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
235-
packet->private_segment_size = private_size;
261+
packet->private_segment_size =
262+
dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
236263
packet->group_segment_size = group_size;
237264
packet->kernel_object = kernel;
238265
packet->kernarg_address = args;

0 commit comments

Comments
 (0)