Skip to content

Prefetch uses host allocations due to page migration #1110

@pvelesko

Description

@pvelesko

When using zeMemAllocShared or clSharedMemAllocINTEL for managed memory, host access after GPU kernel execution causes segfaults when a SIGSEGV handler is installed.

Root cause:
Intel's driver uses page migration for shared allocations. After GPU accesses the memory, host page mappings are left in ---s state (no permissions). When host code accesses this memory:

  • Without signal handler: kernel handles the page fault, driver migrates data back, access succeeds
  • With signal handler (Catch2, debuggers, etc.): signal handler intercepts SIGSEGV before driver can handle it → crash

Workaround:
Use zeMemAllocHost/clHostMemAllocINTEL instead of shared allocations for managed memory. Host memory remains accessible from both host and device without migration issues.

Trade-off:
Performance penalty since data stays in system memory instead of migrating to device memory.

Affected tests:
Any test using hipMallocManaged followed by GPU kernel and host access, when run under Catch2 framework.

Reproducer:

#define CL_TARGET_OPENCL_VERSION 300
#include <CL/cl.h>
#include <CL/cl_ext.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <signal.h>
#include <setjmp.h>

static jmp_buf jump_buffer;
static volatile sig_atomic_t got_segfault = 0;

void segfault_handler(int sig) {
    got_segfault = 1;
    longjmp(jump_buffer, 1);
}

#define CHECK_CL(err, msg) do { \
    if (err != CL_SUCCESS) { \
        fprintf(stderr, "OpenCL Error %d at %s:%d - %s\n", err, __FILE__, __LINE__, msg); \
        exit(1); \
    } \
} while(0)

// Minimal kernel that just reads from input and writes to output
const char* kernelSource = R"(
__kernel void copy(__global int* output, __global const int* input, int N) {
    int gid = get_global_id(0);
    if (gid < N) {
        output[gid] = input[gid];
    }
}
)";

typedef void* (*clSharedMemAllocINTEL_fn)(cl_context, cl_device_id,
    const cl_mem_properties_intel*, size_t, cl_uint, cl_int*);
typedef cl_int (*clMemFreeINTEL_fn)(cl_context, void*);

void print_mapping(void* ptr) {
    char cmd[256];
    snprintf(cmd, sizeof(cmd),
        "grep %lx /proc/%d/maps 2>/dev/null | head -1 || echo 'not found'",
        ((unsigned long)ptr) >> 12 << 12, getpid());
    system(cmd);
}

int main(int argc, char** argv) {
    int use_svm = (argc > 1 && strcmp(argv[1], "svm") == 0);

    printf("Intel OpenCL USM Shared Memory Bug Reproducer\n");
    printf("==============================================\n");
    printf("Allocation method: %s\n\n", use_svm ? "SVM (clSVMAlloc)" : "USM (clSharedMemAllocINTEL)");

    cl_int err;
    cl_platform_id platform = NULL;
    cl_device_id device = NULL;

    // Find Intel GPU
    cl_uint numPlatforms;
    clGetPlatformIDs(0, NULL, &numPlatforms);
    cl_platform_id* platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
    clGetPlatformIDs(numPlatforms, platforms, NULL);

    for (cl_uint i = 0; i < numPlatforms; i++) {
        char platName[256];
        clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(platName), platName, NULL);

        cl_uint numDevices;
        err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
        if (err == CL_SUCCESS && numDevices > 0) {
            platform = platforms[i];
            clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
            printf("Platform: %s\n", platName);
            break;
        }
    }
    free(platforms);

    if (!device) {
        fprintf(stderr, "No GPU found\n");
        return 1;
    }

    char deviceName[256], driverVersion[256];
    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL);
    clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(driverVersion), driverVersion, NULL);
    printf("Device: %s\n", deviceName);
    printf("Driver: %s\n\n", driverVersion);

    // Check for USM extension
    char extensions[8192];
    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(extensions), extensions, NULL);
    if (!use_svm && !strstr(extensions, "cl_intel_unified_shared_memory")) {
        fprintf(stderr, "USM extension not supported\n");
        return 1;
    }

    cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    CHECK_CL(err, "clCreateContext");

    cl_command_queue queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
    CHECK_CL(err, "clCreateCommandQueue");

    // Build kernel
    cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, &err);
    CHECK_CL(err, "clCreateProgramWithSource");
    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    if (err != CL_SUCCESS) {
        char log[4096];
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(log), log, NULL);
        fprintf(stderr, "Build error: %s\n", log);
        return 1;
    }
    cl_kernel kernel = clCreateKernel(program, "copy", &err);
    CHECK_CL(err, "clCreateKernel");

    const int N = 8192;  // Larger allocation to ensure issue manifests
    const size_t size = N * sizeof(int);
    void* input = NULL;
    void* output = NULL;

    clSharedMemAllocINTEL_fn clSharedMemAllocINTEL = NULL;
    clMemFreeINTEL_fn clMemFreeINTEL = NULL;

    // Allocate memory
    printf("Step 1: Allocate memory\n");
    if (use_svm) {
        input = clSVMAlloc(context, CL_MEM_READ_WRITE, size, 0);
        output = clSVMAlloc(context, CL_MEM_READ_WRITE, size, 0);
        if (!input || !output) {
            fprintf(stderr, "SVM allocation failed\n");
            return 1;
        }
    } else {
        clSharedMemAllocINTEL = (clSharedMemAllocINTEL_fn)
            clGetExtensionFunctionAddressForPlatform(platform, "clSharedMemAllocINTEL");
        clMemFreeINTEL = (clMemFreeINTEL_fn)
            clGetExtensionFunctionAddressForPlatform(platform, "clMemFreeINTEL");

        input = clSharedMemAllocINTEL(context, device, NULL, size, 0, &err);
        CHECK_CL(err, "clSharedMemAllocINTEL (input)");
        output = clSharedMemAllocINTEL(context, device, NULL, size, 0, &err);
        CHECK_CL(err, "clSharedMemAllocINTEL (output)");
    }
    printf("   input=%p output=%p\n", input, output);
    printf("   Memory mapping: "); print_mapping(input);

    // Initialize from host - direct access for USM, mapped access for SVM
    printf("\nStep 2: Initialize memory from host\n");
    int* in = (int*)input;
    int* out = (int*)output;

    if (use_svm) {
        // SVM coarse-grain requires map/unmap
        clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_WRITE, input, size, 0, NULL, NULL);
        clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_WRITE, output, size, 0, NULL, NULL);
    }
    // USM shared memory can be accessed directly (that's the point of USM)
    for (int i = 0; i < N; i++) {
        in[i] = i;
        out[i] = 0;
    }
    if (use_svm) {
        clEnqueueSVMUnmap(queue, input, 0, NULL, NULL);
        clEnqueueSVMUnmap(queue, output, 0, NULL, NULL);
        clFinish(queue);
    }
    printf("   SUCCESS: Wrote %d elements, in[0]=%d\n", N, in[0]);
    printf("   Memory mapping: "); print_mapping(input);

    // Prefetch to device (like hipMemPrefetchAsync does)
    printf("\nStep 3: Prefetch to device\n");
    {
        const void* ptrs[] = {input};
        const size_t sizes[] = {size};
        cl_event ev;
        err = clEnqueueSVMMigrateMem(queue, 1, ptrs, sizes, 0 /*to device*/, 0, NULL, &ev);
        CHECK_CL(err, "clEnqueueSVMMigrateMem");
        clWaitForEvents(1, &ev);
        clReleaseEvent(ev);
        clFinish(queue);
    }
    printf("   Prefetch completed\n");
    printf("   Memory mapping: "); print_mapping(input);

    // Set kernel arguments and run
    printf("\nStep 4: Run kernel\n");
    if (use_svm) {
        void* ptrs[] = {input, output};
        clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_SVM_PTRS, sizeof(ptrs), ptrs);
    } else {
        void* ptrs[] = {input, output};
        clSetKernelExecInfo(kernel, 0x4203 /*CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL*/, sizeof(ptrs), ptrs);
        // Also set indirect access flag like chipStar does
        cl_bool indirect = CL_TRUE;
        clSetKernelExecInfo(kernel, 0x4202 /*CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL*/,
                           sizeof(cl_bool), &indirect);
    }
    clSetKernelArgSVMPointer(kernel, 0, output);
    clSetKernelArgSVMPointer(kernel, 1, input);
    clSetKernelArg(kernel, 2, sizeof(int), &N);

    size_t globalSize = N;
    cl_event ev;
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, NULL, 0, NULL, &ev);
    CHECK_CL(err, "clEnqueueNDRangeKernel");
    clWaitForEvents(1, &ev);
    clReleaseEvent(ev);
    clFinish(queue);
    printf("   Kernel completed\n");
    printf("   Memory mapping: "); print_mapping(input);

    // Try to read from host - NO SVM MAP, just direct access like HIP managed memory
    printf("\nStep 5: Read memory from host (direct access, no clEnqueueSVMMap)\n");
    printf("   Memory mapping: "); print_mapping(input);

    // Set up segfault handler
    struct sigaction sa, old_sa;
    sa.sa_handler = segfault_handler;
    sigemptyset(&sa.sa_mask);
    sa.sa_flags = 0;
    sigaction(SIGSEGV, &sa, &old_sa);

    if (setjmp(jump_buffer) == 0) {
        printf("   Attempting to read input[0]... ");
        fflush(stdout);
        volatile int val = in[0];
        printf("SUCCESS: %d\n", val);
    } else {
        printf("SEGFAULT!\n");
    }

    // Restore handler
    sigaction(SIGSEGV, &old_sa, NULL);

    if (got_segfault) {
        printf("\n*** BUG REPRODUCED: USM memory inaccessible from host after kernel ***\n");
        return 1;
    }

    // Cleanup
    if (use_svm) {
        clSVMFree(context, input);
        clSVMFree(context, output);
    } else {
        clMemFreeINTEL(context, input);
        clMemFreeINTEL(context, output);
    }
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    printf("\nTEST PASSED\n");
    return 0;
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions