[libc] Update the AMDGPU implementation to use code object 5 (#72580)

jhuber6 · web-flow · commit 8341a40ec199 · 2023-11-21T07:14:10.000-06:00
Summary: This patch includes the necessary changes to make the `libc` tests running on AMD GPUs run using the newer code object version. The 'code object version' is AMD's internal ABI for making kernel calls. The move from 4 to 5 changed how we handle arguments for builtins such as obtaining the grid size or setting up the size of the private stack. Fixes: #72517
diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake
@@ -120,8 +120,5 @@ if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
   # The AMDGPU environment uses different code objects to encode the ABI for
   # kernel calls and intrinsic functions. We want to specify this manually to
   # conform to whatever the test suite was built to handle.
-  # FIXME: The test suite currently hangs when compiled targeting version five.
-  # This occurrs during traversal of the callback array in the startup code. We
-  # deliberately use version four until this can be addressed.
-  set(LIBC_GPU_CODE_OBJECT_VERSION 4)
+  set(LIBC_GPU_CODE_OBJECT_VERSION 5)
 endif()
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -34,6 +34,19 @@
 #include <tuple>
 #include <utility>
 
+// The implicit arguments of COV5 AMDGPU kernels.
+struct implicit_args_t {
+  uint32_t grid_size_x;
+  uint32_t grid_size_y;
+  uint32_t grid_size_z;
+  uint16_t workgroup_size_x;
+  uint16_t workgroup_size_y;
+  uint16_t workgroup_size_z;
+  uint8_t Unused0[46];
+  uint16_t grid_dims;
+  uint8_t Unused1[190];
+};
+
 /// Print the error code and exit if \p code indicates an error.
 static void handle_error(hsa_status_t code) {
   if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
@@ -185,11 +198,13 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
   uint32_t args_size;
   uint32_t group_size;
   uint32_t private_size;
+  bool dynamic_stack;
 
   std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
       {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
       {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
       {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
+      {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
       {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};
 
   for (auto &[info, value] : symbol_infos)
@@ -209,6 +224,19 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
   std::memset(args, 0, args_size);
   std::memcpy(args, &kernel_args, sizeof(args_t));
 
+  // Initialize the necessary implicit arguments to the proper values.
+  bool dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
+              (params.num_blocks_z * params.num_threads_z != 1);
+  implicit_args_t *implicit_args = reinterpret_cast<implicit_args_t *>(
+      reinterpret_cast<uint8_t *>(args) + sizeof(args_t));
+  implicit_args->grid_dims = dims;
+  implicit_args->grid_size_x = params.num_threads_x;
+  implicit_args->grid_size_y = params.num_threads_y;
+  implicit_args->grid_size_z = params.num_threads_z;
+  implicit_args->workgroup_size_x = params.num_blocks_x;
+  implicit_args->workgroup_size_y = params.num_blocks_y;
+  implicit_args->workgroup_size_z = params.num_blocks_z;
+
   // Obtain a packet from the queue.
   uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
   while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
@@ -222,17 +250,16 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
   // Set up the packet for exeuction on the device. We currently only launch
   // with one thread on the device, forcing the rest of the wavefront to be
   // masked off.
-  uint16_t setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
-                    (params.num_blocks_z * params.num_threads_z != 1))
-                   << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+  uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
   packet->workgroup_size_x = params.num_threads_x;
   packet->workgroup_size_y = params.num_threads_y;
   packet->workgroup_size_z = params.num_threads_z;
   packet->reserved0 = 0;
   packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
   packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
   packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
-  packet->private_segment_size = private_size;
+  packet->private_segment_size =
+      dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
   packet->group_segment_size = group_size;
   packet->kernel_object = kernel;
   packet->kernarg_address = args;