34
34
#include < tuple>
35
35
#include < utility>
36
36
37
// The implicit arguments of COV5 AMDGPU kernels. This struct is overlaid
// directly on the raw kernel argument buffer (immediately after the explicit
// arguments), so its byte layout must not change: fields may not be
// reordered, resized, or separated by compiler-inserted padding.
//
// NOTE(review): the COV5 ABI names the leading fields hidden_block_count_* and
// hidden_group_size_*; confirm that every writer stores the number of
// workgroups in grid_size_* and the threads per workgroup in
// workgroup_size_*.
struct implicit_args_t {
  uint32_t grid_size_x;      // Byte offset 0.
  uint32_t grid_size_y;      // Byte offset 4.
  uint32_t grid_size_z;      // Byte offset 8.
  uint16_t workgroup_size_x; // Byte offset 12.
  uint16_t workgroup_size_y; // Byte offset 14.
  uint16_t workgroup_size_z; // Byte offset 16.
  uint8_t Unused0[46];       // Bytes 18-63, not written through this struct.
  uint16_t grid_dims;        // Byte offset 64, number of dimensions in use.
  uint8_t Unused1[190];      // Bytes 66-255, not written through this struct.
};

// The member sizes sum to exactly 256 bytes, so this check fails as soon as
// the compiler inserts any padding or a field changes size, which in turn
// guarantees every offset documented above.
static_assert(sizeof(implicit_args_t) == 256,
              "implicit_args_t must match the 256-byte implicit argument "
              "layout exactly");
49
+
37
50
/// Print the error code and exit if \p code indicates an error.
38
51
static void handle_error (hsa_status_t code) {
39
52
if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
@@ -185,11 +198,13 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
185
198
uint32_t args_size;
186
199
uint32_t group_size;
187
200
uint32_t private_size;
201
+ bool dynamic_stack;
188
202
189
203
std::pair<hsa_executable_symbol_info_t , void *> symbol_infos[] = {
190
204
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
191
205
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
192
206
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
207
+ {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, &dynamic_stack},
193
208
{HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};
194
209
195
210
for (auto &[info, value] : symbol_infos)
@@ -209,6 +224,19 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
209
224
std::memset (args, 0 , args_size);
210
225
std::memcpy (args, &kernel_args, sizeof (args_t ));
211
226
227
+ // Initialize the necessary implicit arguments to the proper values.
228
+ bool dims = 1 + (params.num_blocks_y * params.num_threads_y != 1 ) +
229
+ (params.num_blocks_z * params.num_threads_z != 1 );
230
+ implicit_args_t *implicit_args = reinterpret_cast <implicit_args_t *>(
231
+ reinterpret_cast <uint8_t *>(args) + sizeof (args_t ));
232
+ implicit_args->grid_dims = dims;
233
+ implicit_args->grid_size_x = params.num_threads_x ;
234
+ implicit_args->grid_size_y = params.num_threads_y ;
235
+ implicit_args->grid_size_z = params.num_threads_z ;
236
+ implicit_args->workgroup_size_x = params.num_blocks_x ;
237
+ implicit_args->workgroup_size_y = params.num_blocks_y ;
238
+ implicit_args->workgroup_size_z = params.num_blocks_z ;
239
+
212
240
// Obtain a packet from the queue.
213
241
uint64_t packet_id = hsa_queue_add_write_index_relaxed (queue, 1 );
214
242
while (packet_id - hsa_queue_load_read_index_scacquire (queue) >= queue->size )
@@ -222,17 +250,16 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
222
250
// Set up the packet for execution on the device. We currently only launch
223
251
// with one thread on the device, forcing the rest of the wavefront to be
224
252
// masked off.
225
- uint16_t setup = (1 + (params.num_blocks_y * params.num_threads_y != 1 ) +
226
- (params.num_blocks_z * params.num_threads_z != 1 ))
227
- << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
253
+ uint16_t setup = (dims) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
228
254
packet->workgroup_size_x = params.num_threads_x ;
229
255
packet->workgroup_size_y = params.num_threads_y ;
230
256
packet->workgroup_size_z = params.num_threads_z ;
231
257
packet->reserved0 = 0 ;
232
258
packet->grid_size_x = params.num_blocks_x * params.num_threads_x ;
233
259
packet->grid_size_y = params.num_blocks_y * params.num_threads_y ;
234
260
packet->grid_size_z = params.num_blocks_z * params.num_threads_z ;
235
- packet->private_segment_size = private_size;
261
+ packet->private_segment_size =
262
+ dynamic_stack ? 16 * 1024 /* 16 KB */ : private_size;
236
263
packet->group_segment_size = group_size;
237
264
packet->kernel_object = kernel;
238
265
packet->kernarg_address = args;
0 commit comments