-
Notifications
You must be signed in to change notification settings - Fork 40
Description
When using zeMemAllocShared or clSharedMemAllocINTEL for managed memory, host access after GPU kernel execution causes segfaults when a SIGSEGV handler is installed.
Root cause:
Intel's driver uses page migration for shared allocations. After GPU accesses the memory, host page mappings are left in ---s state (no permissions). When host code accesses this memory:
- Without signal handler: kernel handles the page fault, driver migrates data back, access succeeds
- With signal handler (Catch2, debuggers, etc.): signal handler intercepts SIGSEGV before driver can handle it → crash
Workaround:
Use zeMemAllocHost/clHostMemAllocINTEL instead of shared allocations for managed memory. Host memory remains accessible from both host and device without migration issues.
Trade-off:
Performance penalty since data stays in system memory instead of migrating to device memory.
Affected tests:
Any test using hipMallocManaged followed by GPU kernel and host access, when run under Catch2 framework.
Reproducer:
#define CL_TARGET_OPENCL_VERSION 300
#include <CL/cl.h>
#include <CL/cl_ext.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <signal.h>
#include <setjmp.h>
static jmp_buf jump_buffer;
static volatile sig_atomic_t got_segfault = 0;
void segfault_handler(int sig) {
got_segfault = 1;
longjmp(jump_buffer, 1);
}
#define CHECK_CL(err, msg) do { \
if (err != CL_SUCCESS) { \
fprintf(stderr, "OpenCL Error %d at %s:%d - %s\n", err, __FILE__, __LINE__, msg); \
exit(1); \
} \
} while(0)
// Minimal kernel that just reads from input and writes to output
const char* kernelSource = R"(
__kernel void copy(__global int* output, __global const int* input, int N) {
int gid = get_global_id(0);
if (gid < N) {
output[gid] = input[gid];
}
}
)";
typedef void* (*clSharedMemAllocINTEL_fn)(cl_context, cl_device_id,
const cl_mem_properties_intel*, size_t, cl_uint, cl_int*);
typedef cl_int (*clMemFreeINTEL_fn)(cl_context, void*);
void print_mapping(void* ptr) {
char cmd[256];
snprintf(cmd, sizeof(cmd),
"grep %lx /proc/%d/maps 2>/dev/null | head -1 || echo 'not found'",
((unsigned long)ptr) >> 12 << 12, getpid());
system(cmd);
}
int main(int argc, char** argv) {
int use_svm = (argc > 1 && strcmp(argv[1], "svm") == 0);
printf("Intel OpenCL USM Shared Memory Bug Reproducer\n");
printf("==============================================\n");
printf("Allocation method: %s\n\n", use_svm ? "SVM (clSVMAlloc)" : "USM (clSharedMemAllocINTEL)");
cl_int err;
cl_platform_id platform = NULL;
cl_device_id device = NULL;
// Find Intel GPU
cl_uint numPlatforms;
clGetPlatformIDs(0, NULL, &numPlatforms);
cl_platform_id* platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
clGetPlatformIDs(numPlatforms, platforms, NULL);
for (cl_uint i = 0; i < numPlatforms; i++) {
char platName[256];
clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(platName), platName, NULL);
cl_uint numDevices;
err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
if (err == CL_SUCCESS && numDevices > 0) {
platform = platforms[i];
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
printf("Platform: %s\n", platName);
break;
}
}
free(platforms);
if (!device) {
fprintf(stderr, "No GPU found\n");
return 1;
}
char deviceName[256], driverVersion[256];
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL);
clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(driverVersion), driverVersion, NULL);
printf("Device: %s\n", deviceName);
printf("Driver: %s\n\n", driverVersion);
// Check for USM extension
char extensions[8192];
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(extensions), extensions, NULL);
if (!use_svm && !strstr(extensions, "cl_intel_unified_shared_memory")) {
fprintf(stderr, "USM extension not supported\n");
return 1;
}
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
CHECK_CL(err, "clCreateContext");
cl_command_queue queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
CHECK_CL(err, "clCreateCommandQueue");
// Build kernel
cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, &err);
CHECK_CL(err, "clCreateProgramWithSource");
err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
if (err != CL_SUCCESS) {
char log[4096];
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(log), log, NULL);
fprintf(stderr, "Build error: %s\n", log);
return 1;
}
cl_kernel kernel = clCreateKernel(program, "copy", &err);
CHECK_CL(err, "clCreateKernel");
const int N = 8192; // Larger allocation to ensure issue manifests
const size_t size = N * sizeof(int);
void* input = NULL;
void* output = NULL;
clSharedMemAllocINTEL_fn clSharedMemAllocINTEL = NULL;
clMemFreeINTEL_fn clMemFreeINTEL = NULL;
// Allocate memory
printf("Step 1: Allocate memory\n");
if (use_svm) {
input = clSVMAlloc(context, CL_MEM_READ_WRITE, size, 0);
output = clSVMAlloc(context, CL_MEM_READ_WRITE, size, 0);
if (!input || !output) {
fprintf(stderr, "SVM allocation failed\n");
return 1;
}
} else {
clSharedMemAllocINTEL = (clSharedMemAllocINTEL_fn)
clGetExtensionFunctionAddressForPlatform(platform, "clSharedMemAllocINTEL");
clMemFreeINTEL = (clMemFreeINTEL_fn)
clGetExtensionFunctionAddressForPlatform(platform, "clMemFreeINTEL");
input = clSharedMemAllocINTEL(context, device, NULL, size, 0, &err);
CHECK_CL(err, "clSharedMemAllocINTEL (input)");
output = clSharedMemAllocINTEL(context, device, NULL, size, 0, &err);
CHECK_CL(err, "clSharedMemAllocINTEL (output)");
}
printf(" input=%p output=%p\n", input, output);
printf(" Memory mapping: "); print_mapping(input);
// Initialize from host - direct access for USM, mapped access for SVM
printf("\nStep 2: Initialize memory from host\n");
int* in = (int*)input;
int* out = (int*)output;
if (use_svm) {
// SVM coarse-grain requires map/unmap
clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_WRITE, input, size, 0, NULL, NULL);
clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_WRITE, output, size, 0, NULL, NULL);
}
// USM shared memory can be accessed directly (that's the point of USM)
for (int i = 0; i < N; i++) {
in[i] = i;
out[i] = 0;
}
if (use_svm) {
clEnqueueSVMUnmap(queue, input, 0, NULL, NULL);
clEnqueueSVMUnmap(queue, output, 0, NULL, NULL);
clFinish(queue);
}
printf(" SUCCESS: Wrote %d elements, in[0]=%d\n", N, in[0]);
printf(" Memory mapping: "); print_mapping(input);
// Prefetch to device (like hipMemPrefetchAsync does)
printf("\nStep 3: Prefetch to device\n");
{
const void* ptrs[] = {input};
const size_t sizes[] = {size};
cl_event ev;
err = clEnqueueSVMMigrateMem(queue, 1, ptrs, sizes, 0 /*to device*/, 0, NULL, &ev);
CHECK_CL(err, "clEnqueueSVMMigrateMem");
clWaitForEvents(1, &ev);
clReleaseEvent(ev);
clFinish(queue);
}
printf(" Prefetch completed\n");
printf(" Memory mapping: "); print_mapping(input);
// Set kernel arguments and run
printf("\nStep 4: Run kernel\n");
if (use_svm) {
void* ptrs[] = {input, output};
clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_SVM_PTRS, sizeof(ptrs), ptrs);
} else {
void* ptrs[] = {input, output};
clSetKernelExecInfo(kernel, 0x4203 /*CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL*/, sizeof(ptrs), ptrs);
// Also set indirect access flag like chipStar does
cl_bool indirect = CL_TRUE;
clSetKernelExecInfo(kernel, 0x4202 /*CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL*/,
sizeof(cl_bool), &indirect);
}
clSetKernelArgSVMPointer(kernel, 0, output);
clSetKernelArgSVMPointer(kernel, 1, input);
clSetKernelArg(kernel, 2, sizeof(int), &N);
size_t globalSize = N;
cl_event ev;
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, NULL, 0, NULL, &ev);
CHECK_CL(err, "clEnqueueNDRangeKernel");
clWaitForEvents(1, &ev);
clReleaseEvent(ev);
clFinish(queue);
printf(" Kernel completed\n");
printf(" Memory mapping: "); print_mapping(input);
// Try to read from host - NO SVM MAP, just direct access like HIP managed memory
printf("\nStep 5: Read memory from host (direct access, no clEnqueueSVMMap)\n");
printf(" Memory mapping: "); print_mapping(input);
// Set up segfault handler
struct sigaction sa, old_sa;
sa.sa_handler = segfault_handler;
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;
sigaction(SIGSEGV, &sa, &old_sa);
if (setjmp(jump_buffer) == 0) {
printf(" Attempting to read input[0]... ");
fflush(stdout);
volatile int val = in[0];
printf("SUCCESS: %d\n", val);
} else {
printf("SEGFAULT!\n");
}
// Restore handler
sigaction(SIGSEGV, &old_sa, NULL);
if (got_segfault) {
printf("\n*** BUG REPRODUCED: USM memory inaccessible from host after kernel ***\n");
return 1;
}
// Cleanup
if (use_svm) {
clSVMFree(context, input);
clSVMFree(context, output);
} else {
clMemFreeINTEL(context, input);
clMemFreeINTEL(context, output);
}
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
printf("\nTEST PASSED\n");
return 0;
}