Prefetch uses host allocations due to  page migration

When using `zeMemAllocShared` or `clSharedMemAllocINTEL` for managed memory, host access after GPU kernel execution causes segfaults when a SIGSEGV handler is installed.

**Root cause:**
Intel's driver uses page migration for shared allocations. After GPU accesses the memory, host page mappings are left in `---s` state (no permissions). When host code accesses this memory:
- Without signal handler: kernel handles the page fault, driver migrates data back, access succeeds
- With signal handler (Catch2, debuggers, etc.): signal handler intercepts SIGSEGV before driver can handle it → crash

**Workaround:**
Use `zeMemAllocHost`/`clHostMemAllocINTEL` instead of shared allocations for managed memory. Host memory remains accessible from both host and device without migration issues.

**Trade-off:**
Performance penalty since data stays in system memory instead of migrating to device memory.

**Affected tests:**
Any test using `hipMallocManaged` followed by GPU kernel and host access, when run under Catch2 framework.


Reproducer:


```
#define CL_TARGET_OPENCL_VERSION 300
#include <CL/cl.h>
#include <CL/cl_ext.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <signal.h>
#include <setjmp.h>

static jmp_buf jump_buffer;
static volatile sig_atomic_t got_segfault = 0;

void segfault_handler(int sig) {
    got_segfault = 1;
    longjmp(jump_buffer, 1);
}

#define CHECK_CL(err, msg) do { \
    if (err != CL_SUCCESS) { \
        fprintf(stderr, "OpenCL Error %d at %s:%d - %s\n", err, __FILE__, __LINE__, msg); \
        exit(1); \
    } \
} while(0)

// Minimal kernel that just reads from input and writes to output
const char* kernelSource = R"(
__kernel void copy(__global int* output, __global const int* input, int N) {
    int gid = get_global_id(0);
    if (gid < N) {
        output[gid] = input[gid];
    }
}
)";

typedef void* (*clSharedMemAllocINTEL_fn)(cl_context, cl_device_id,
    const cl_mem_properties_intel*, size_t, cl_uint, cl_int*);
typedef cl_int (*clMemFreeINTEL_fn)(cl_context, void*);

void print_mapping(void* ptr) {
    char cmd[256];
    snprintf(cmd, sizeof(cmd),
        "grep %lx /proc/%d/maps 2>/dev/null | head -1 || echo 'not found'",
        ((unsigned long)ptr) >> 12 << 12, getpid());
    system(cmd);
}

int main(int argc, char** argv) {
    int use_svm = (argc > 1 && strcmp(argv[1], "svm") == 0);

    printf("Intel OpenCL USM Shared Memory Bug Reproducer\n");
    printf("==============================================\n");
    printf("Allocation method: %s\n\n", use_svm ? "SVM (clSVMAlloc)" : "USM (clSharedMemAllocINTEL)");

    cl_int err;
    cl_platform_id platform = NULL;
    cl_device_id device = NULL;

    // Find Intel GPU
    cl_uint numPlatforms;
    clGetPlatformIDs(0, NULL, &numPlatforms);
    cl_platform_id* platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
    clGetPlatformIDs(numPlatforms, platforms, NULL);

    for (cl_uint i = 0; i < numPlatforms; i++) {
        char platName[256];
        clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(platName), platName, NULL);

        cl_uint numDevices;
        err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
        if (err == CL_SUCCESS && numDevices > 0) {
            platform = platforms[i];
            clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
            printf("Platform: %s\n", platName);
            break;
        }
    }
    free(platforms);

    if (!device) {
        fprintf(stderr, "No GPU found\n");
        return 1;
    }

    char deviceName[256], driverVersion[256];
    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL);
    clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(driverVersion), driverVersion, NULL);
    printf("Device: %s\n", deviceName);
    printf("Driver: %s\n\n", driverVersion);

    // Check for USM extension
    char extensions[8192];
    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(extensions), extensions, NULL);
    if (!use_svm && !strstr(extensions, "cl_intel_unified_shared_memory")) {
        fprintf(stderr, "USM extension not supported\n");
        return 1;
    }

    cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    CHECK_CL(err, "clCreateContext");

    cl_command_queue queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
    CHECK_CL(err, "clCreateCommandQueue");

    // Build kernel
    cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, &err);
    CHECK_CL(err, "clCreateProgramWithSource");
    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    if (err != CL_SUCCESS) {
        char log[4096];
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(log), log, NULL);
        fprintf(stderr, "Build error: %s\n", log);
        return 1;
    }
    cl_kernel kernel = clCreateKernel(program, "copy", &err);
    CHECK_CL(err, "clCreateKernel");

    const int N = 8192;  // Larger allocation to ensure issue manifests
    const size_t size = N * sizeof(int);
    void* input = NULL;
    void* output = NULL;

    clSharedMemAllocINTEL_fn clSharedMemAllocINTEL = NULL;
    clMemFreeINTEL_fn clMemFreeINTEL = NULL;

    // Allocate memory
    printf("Step 1: Allocate memory\n");
    if (use_svm) {
        input = clSVMAlloc(context, CL_MEM_READ_WRITE, size, 0);
        output = clSVMAlloc(context, CL_MEM_READ_WRITE, size, 0);
        if (!input || !output) {
            fprintf(stderr, "SVM allocation failed\n");
            return 1;
        }
    } else {
        clSharedMemAllocINTEL = (clSharedMemAllocINTEL_fn)
            clGetExtensionFunctionAddressForPlatform(platform, "clSharedMemAllocINTEL");
        clMemFreeINTEL = (clMemFreeINTEL_fn)
            clGetExtensionFunctionAddressForPlatform(platform, "clMemFreeINTEL");

        input = clSharedMemAllocINTEL(context, device, NULL, size, 0, &err);
        CHECK_CL(err, "clSharedMemAllocINTEL (input)");
        output = clSharedMemAllocINTEL(context, device, NULL, size, 0, &err);
        CHECK_CL(err, "clSharedMemAllocINTEL (output)");
    }
    printf("   input=%p output=%p\n", input, output);
    printf("   Memory mapping: "); print_mapping(input);

    // Initialize from host - direct access for USM, mapped access for SVM
    printf("\nStep 2: Initialize memory from host\n");
    int* in = (int*)input;
    int* out = (int*)output;

    if (use_svm) {
        // SVM coarse-grain requires map/unmap
        clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_WRITE, input, size, 0, NULL, NULL);
        clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_WRITE, output, size, 0, NULL, NULL);
    }
    // USM shared memory can be accessed directly (that's the point of USM)
    for (int i = 0; i < N; i++) {
        in[i] = i;
        out[i] = 0;
    }
    if (use_svm) {
        clEnqueueSVMUnmap(queue, input, 0, NULL, NULL);
        clEnqueueSVMUnmap(queue, output, 0, NULL, NULL);
        clFinish(queue);
    }
    printf("   SUCCESS: Wrote %d elements, in[0]=%d\n", N, in[0]);
    printf("   Memory mapping: "); print_mapping(input);

    // Prefetch to device (like hipMemPrefetchAsync does)
    printf("\nStep 3: Prefetch to device\n");
    {
        const void* ptrs[] = {input};
        const size_t sizes[] = {size};
        cl_event ev;
        err = clEnqueueSVMMigrateMem(queue, 1, ptrs, sizes, 0 /*to device*/, 0, NULL, &ev);
        CHECK_CL(err, "clEnqueueSVMMigrateMem");
        clWaitForEvents(1, &ev);
        clReleaseEvent(ev);
        clFinish(queue);
    }
    printf("   Prefetch completed\n");
    printf("   Memory mapping: "); print_mapping(input);

    // Set kernel arguments and run
    printf("\nStep 4: Run kernel\n");
    if (use_svm) {
        void* ptrs[] = {input, output};
        clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_SVM_PTRS, sizeof(ptrs), ptrs);
    } else {
        void* ptrs[] = {input, output};
        clSetKernelExecInfo(kernel, 0x4203 /*CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL*/, sizeof(ptrs), ptrs);
        // Also set indirect access flag like chipStar does
        cl_bool indirect = CL_TRUE;
        clSetKernelExecInfo(kernel, 0x4202 /*CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL*/,
                           sizeof(cl_bool), &indirect);
    }
    clSetKernelArgSVMPointer(kernel, 0, output);
    clSetKernelArgSVMPointer(kernel, 1, input);
    clSetKernelArg(kernel, 2, sizeof(int), &N);

    size_t globalSize = N;
    cl_event ev;
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, NULL, 0, NULL, &ev);
    CHECK_CL(err, "clEnqueueNDRangeKernel");
    clWaitForEvents(1, &ev);
    clReleaseEvent(ev);
    clFinish(queue);
    printf("   Kernel completed\n");
    printf("   Memory mapping: "); print_mapping(input);

    // Try to read from host - NO SVM MAP, just direct access like HIP managed memory
    printf("\nStep 5: Read memory from host (direct access, no clEnqueueSVMMap)\n");
    printf("   Memory mapping: "); print_mapping(input);

    // Set up segfault handler
    struct sigaction sa, old_sa;
    sa.sa_handler = segfault_handler;
    sigemptyset(&sa.sa_mask);
    sa.sa_flags = 0;
    sigaction(SIGSEGV, &sa, &old_sa);

    if (setjmp(jump_buffer) == 0) {
        printf("   Attempting to read input[0]... ");
        fflush(stdout);
        volatile int val = in[0];
        printf("SUCCESS: %d\n", val);
    } else {
        printf("SEGFAULT!\n");
    }

    // Restore handler
    sigaction(SIGSEGV, &old_sa, NULL);

    if (got_segfault) {
        printf("\n*** BUG REPRODUCED: USM memory inaccessible from host after kernel ***\n");
        return 1;
    }

    // Cleanup
    if (use_svm) {
        clSVMFree(context, input);
        clSVMFree(context, output);
    } else {
        clMemFreeINTEL(context, input);
        clMemFreeINTEL(context, output);
    }
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    printf("\nTEST PASSED\n");
    return 0;
}
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Prefetch uses host allocations due to page migration #1110

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Prefetch uses host allocations due to page migration #1110

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions