Skip to content

Set default values for kernel image static part and skip CSR write if no change #349

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions include/acl_kernel_if.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ typedef struct {
// CRA address offset for backwards compatibility
unsigned int cra_address_offset = 8;

// Kernel static image cache for tracking changed work dimensions, etc.
std::vector<std::unique_ptr<char[]>> static_img_cache;
// Kernel argument cache for tracking changed arguments
std::vector<std::unique_ptr<char[]>> accel_arg_cache;
} acl_kernel_if;
Expand Down
15 changes: 14 additions & 1 deletion include/acl_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ class acl_device_program_info_t {
// don't expect it.
#pragma pack(push, 4)
// These are the bytes written to global memory for a kernel invocation.
typedef struct {
typedef struct acl_dev_kernel_invocation_image {
// The activation_id is the index into the device op queue.
// The value at acl_platform.device_op_queue[activation_id] will be
// updated asynchronously by the HAL, so its address must remain stable.
Expand Down Expand Up @@ -485,6 +485,19 @@ typedef struct {
char *arg_value;
size_t arg_value_size;

// Constructor: initialize the invocation image to default values.
// The defaults are hard-coded for now: a one-dimensional launch
// (work_dim == 1) with a work group size of 1, every per-dimension size
// and group count set to 1, every global offset set to 0, and no argument
// buffer attached (null pointer, zero size).
acl_dev_kernel_invocation_image()
    : activation_id(0), accel_id(0), work_dim(1), work_group_size(1),
      padding(0), arg_value(nullptr), arg_value_size(0) {
  // All three possible work dimensions are initialized, regardless of
  // work_dim, so no per-dimension field is left indeterminate.
  for (unsigned dim = 0; dim < 3; ++dim) {
    global_work_size[dim] = 1;
    num_groups[dim] = 1;
    local_work_size[dim] = 1;
    global_work_offset[dim] = 0;
  }
}

} acl_dev_kernel_invocation_image_t;

// Invocation image structure that matches the 18.1 CRA layout.
Expand Down
107 changes: 76 additions & 31 deletions src/acl_kernel_if.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,22 +97,23 @@ void acl_kernel_if_register_callbacks(
// **************************** Utility Functions ***************************
// **************************************************************************
void print_invocation_image(acl_kernel_if *kern, char *image_ptr,
size_t image_size, unsigned int offset,
bool is_static) {
size_t image_size, size_t size_to_write,
unsigned int csr_offset, bool is_static,
bool is_write = true, size_t print_offset = 0) {
std::string image_type = is_static ? "stat" : "args";
for (uintptr_t p = 0; p < image_size; p += sizeof(int)) {
std::string overwrite = is_write ? "Writing" : "Keeping";
size_t print_end = print_offset + size_to_write;
assert(print_end <= image_size && "printing invocation image out of bound");
for (uintptr_t p = print_offset; p < print_end; p += sizeof(int)) {
unsigned int pword = 0;
if (p + sizeof(int) > image_size) {
for (size_t i = 0; i < image_size - p; i += sizeof(char)) {
safe_memcpy(((char *)(&pword)) + i, image_ptr + p + i, sizeof(char),
sizeof(int), image_size - p - i);
}
} else {
pword = *(unsigned int *)(image_ptr + p);
}
uintptr_t cpy_size =
(print_end - p > sizeof(int)) ? sizeof(int) : (print_end - p);
safe_memcpy(((char *)(&pword)), image_ptr + p, cpy_size * sizeof(char),
sizeof(int), (print_end - p) * sizeof(char));
ACL_KERNEL_IF_DEBUG_MSG_VERBOSE(
kern, 2, ":: Writing inv image (%s) [%2d] @%8p := %4x\n",
image_type.c_str(), (int)(p), (void *)(offset + p), pword);
kern, 2, ":: %s inv image (%s) [%2d] @%8p := %4x\n",
overwrite.c_str(), image_type.c_str(), (int)(p),
(void *)(csr_offset + p), pword);
}
}

Expand Down Expand Up @@ -879,14 +880,23 @@ int acl_kernel_if_update(const acl_device_def_autodiscovery_t &devdef,
if (kern->num_accel > 0) {
kern->accel_job_ids.resize(kern->num_accel);
kern->accel_invoc_queue_depth.resize(kern->num_accel);
kern->static_img_cache.resize(kern->num_accel);
kern->accel_arg_cache.resize(kern->num_accel);

// Kernel IRQ is a separate thread. Need to use circular buffer to make this
// multithread safe.
kern->accel_queue_front.resize(kern->num_accel);
kern->accel_queue_back.resize(kern->num_accel);

acl_dev_kernel_invocation_image_t default_invocation;
size_t image_size_static =
(size_t)((uintptr_t) & (default_invocation.arg_value) - (uintptr_t) &
(default_invocation.work_dim));

for (unsigned a = 0; a < kern->num_accel; ++a) {
kern->static_img_cache[a] = std::make_unique<char[]>(image_size_static);
memcpy(kern->static_img_cache[a].get(),
(char *)(&(default_invocation.work_dim)), image_size_static);
unsigned int max_same_accel_launches =
devdef.accel[a].fast_launch_depth + 1;
// +1, because fast launch depth does not account for the running kernel
Expand Down Expand Up @@ -1134,37 +1144,55 @@ void acl_kernel_if_launch_kernel_on_custom_sof(
(image->work_dim));
}

if ((kern->io.debug_verbosity) >= 2) {
// We only write the static part of the invocation image if the kernel uses
// CRA control.
if (!kern->streaming_control_signal_names[accel_id]) {
print_invocation_image(kern, (char *)image_p, image_size_static, offset,
true);
}

if (kern->csr_version.has_value() &&
(kern->csr_version != CSR_VERSION_ID_18_1)) {
print_invocation_image(kern, image->arg_value, image->arg_value_size,
(unsigned int)(offset + image_size_static), false);
}
}

// When the CSR version is 18.1, the kernel args are part of the image;
// otherwise, they are in dynamic memory. Only write the static part of the
// invocation image if this kernel uses CRA control.
if (!kern->streaming_control_signal_names[accel_id]) {
acl_kernel_cra_write_block(kern, accel_id, offset, (unsigned int *)image_p,
image_size_static);
if (kern->csr_version == CSR_VERSION_ID_18_1) {
// Just write everything for older CSR version
if ((kern->io.debug_verbosity) >= 2) {
print_invocation_image(kern, (char *)image_p, image_size_static,
image_size_static, offset, true);
}
acl_kernel_cra_write_block(kern, accel_id, offset,
(unsigned int *)image_p, image_size_static);
} else {
char *img_cache_ptr = kern->static_img_cache[accel_id].get();
assert(img_cache_ptr && "kernel image cache not initialized!");
if (memcmp(img_cache_ptr, (char *)image_p, image_size_static) != 0) {
// Something changed in static part of the invocation image,
// write everything to csr
if ((kern->io.debug_verbosity) >= 2) {
print_invocation_image(kern, (char *)image_p, image_size_static,
image_size_static, offset, true);
}
acl_kernel_cra_write_block(kern, accel_id, offset,
(unsigned int *)image_p, image_size_static);
memcpy(img_cache_ptr, (char *)image_p, image_size_static);
} else if ((kern->io.debug_verbosity) >= 2) {
// Nothing's changed, just print the static part of the invocation image
print_invocation_image(kern, (char *)image_p, image_size_static,
image_size_static, offset, true, false);
}
}
}

bool accel_has_agent_args = false;
if (kern->csr_version.has_value() &&
(kern->csr_version != CSR_VERSION_ID_18_1 && image->arg_value_size > 0)) {
accel_has_agent_args = true;
if (!kern->accel_arg_cache[accel_id]) {
// The first time invoking the kernel, just write all the arguments
if ((kern->io.debug_verbosity) >= 2) {
print_invocation_image(kern, image->arg_value, image->arg_value_size,
image->arg_value_size,
(unsigned int)(offset + image_size_static),
false);
}
acl_kernel_cra_write_block(
kern, accel_id, offset + (unsigned int)image_size_static,
(unsigned int *)image->arg_value, image->arg_value_size);
// Initialize kernel argument cache and cache the values
kern->accel_arg_cache[accel_id] =
std::make_unique<char[]>(image->arg_value_size);
memcpy(kern->accel_arg_cache[accel_id].get(), (char *)image->arg_value,
Expand All @@ -1177,6 +1205,7 @@ void acl_kernel_if_launch_kernel_on_custom_sof(
size_t cmp_size = (image->arg_value_size - step) > sizeof(int)
? sizeof(int)
: (image->arg_value_size - step);
// Find range of changed arguments and record size of that block
while (cmp_size > 0 &&
memcmp(arg_cache_ptr + step + size_to_write,
image->arg_value + step + size_to_write, cmp_size) != 0) {
Expand All @@ -1187,8 +1216,23 @@ void acl_kernel_if_launch_kernel_on_custom_sof(
: (image->arg_value_size - step - size_to_write);
}
if (size_to_write == 0) {
step += (unsigned)sizeof(int);
// Current compared block is the same as before, skipping write
size_t size_to_skip = (image->arg_value_size - step > sizeof(int))
? sizeof(int)
: (image->arg_value_size - step);
if ((kern->io.debug_verbosity) >= 2) {
print_invocation_image(
kern, image->arg_value, image->arg_value_size, size_to_skip,
(unsigned int)(offset + image_size_static), false, false, step);
}
step += size_to_skip;
} else {
// Write the changed argument block to csr
if ((kern->io.debug_verbosity) >= 2) {
print_invocation_image(
kern, image->arg_value, image->arg_value_size, size_to_write,
(unsigned int)(offset + image_size_static), false, true, step);
}
acl_kernel_cra_write_block(
kern, accel_id, offset + (unsigned int)(image_size_static + step),
(unsigned int *)(image->arg_value + step), size_to_write);
Expand Down Expand Up @@ -1692,6 +1736,7 @@ void acl_kernel_if_close(acl_kernel_if *kern) {
kern->accel_invoc_queue_depth.clear();
kern->accel_queue_front.clear();
kern->accel_queue_back.clear();
kern->static_img_cache.clear();
kern->accel_arg_cache.clear();
kern->autorun_profiling_kernel_id = -1;
}
Expand Down