Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GenericScanKernel problem with AMD Radeon Pro 5500M Apple Mac #809

Open
yves-surrel opened this issue Jan 12, 2025 · 0 comments
Open

GenericScanKernel problem with AMD Radeon Pro 5500M Apple Mac #809

yves-surrel opened this issue Jan 12, 2025 · 0 comments

Comments

@yves-surrel
Copy link

When trying the simple example(in the doc) of GenericScanKernel with my new MBP 16", it works on all devices (POCL, Intel HD Graphics, Apple cpu) except the last one (AMD Radeon Pro 5500M), with the error

SC failed. No reason given.
(options: -I /Users/yves/miniforge3/lib/python3.11/site-packages/pyopencl/cl)
(source saved as /var/folders/9t/b8p_snrs6_g21pk72_lkyjxw0000gn/T/tmpgxor7x8t.cl)

I reduced the source file to the smallest making the error to appear, to get:

src = """#define local_barrier() barrier(CLK_LOCAL_MEM_FENCE);
#define WITHIN_KERNEL /* empty */
#define KERNEL __kernel
#define GLOBAL_MEM __global
#define LOCAL_MEM __local
#define LOCAL_MEM_ARG __local
#define REQD_WG_SIZE(X,Y,psc_Z) __attribute__((reqd_work_group_size(X, Y, psc_Z)))
#define psc_LID_0 ((ptrdiff_t) get_local_id(0))
#define psc_LID_1 ((ptrdiff_t) get_local_id(1))
#define psc_LID_2 ((ptrdiff_t) get_local_id(2))
#define psc_GID_0 ((ptrdiff_t) get_group_id(0))
#define psc_GID_1 ((ptrdiff_t) get_group_id(1))
#define psc_GID_2 ((ptrdiff_t) get_group_id(2))
#define psc_LDIM_0 ((ptrdiff_t) get_local_size(0))
#define psc_LDIM_1 ((ptrdiff_t) get_local_size(1))
#define psc_LDIM_2 ((ptrdiff_t) get_local_size(2))
#define psc_GDIM_0 ((ptrdiff_t) get_num_groups(0))
#define psc_GDIM_1 ((ptrdiff_t) get_num_groups(1))
#define psc_GDIM_2 ((ptrdiff_t) get_num_groups(2))
    #if __OPENCL_C_VERSION__ < 120
    #pragma OPENCL EXTENSION cl_khr_fp64: enable
    #endif
//CL//
#define psc_WG_SIZE 256
#define psc_SCAN_EXPR(a, b, across_seg_boundary)  across_seg_boundary || b.y & 0b10000000 || b.y & 0b00001000 ? b : b.z & 0b00001000 ? b : a 
#define psc_INPUT_EXPR(i) ((ushort4)(segmentsIn[i], patterns_i, patterns_im1, 0))
    #define psc_IS_SEG_START(i, a) (i % iImageWidth == 0)
typedef ushort4 psc_scan_type;
typedef int psc_index_type;
// NO_SEG_BOUNDARY is the largest representable integer in psc_index_type.
// This assumption is used in code below.
#define NO_SEG_BOUNDARY 2147483647
//CL//
#define psc_K 16
// #define psc_DEBUG
KERNEL
REQD_WG_SIZE(psc_WG_SIZE, 1, 1)
void scan_lev1(
    __global ushort *patterns, __global ushort *segmentsIn, __global ushort *segmentsOut, __global ushort *isOuterContour, __global ushort *contoursLUT, int iImageWidth,
    GLOBAL_MEM psc_scan_type *restrict psc_partial_scan_buffer,
    const psc_index_type N,
    const psc_index_type psc_interval_size
        , GLOBAL_MEM psc_scan_type *restrict psc_interval_results
        // NO_SEG_BOUNDARY if no segment boundary in interval.
        , GLOBAL_MEM psc_index_type *restrict psc_g_first_segment_start_in_interval
        , GLOBAL_MEM char *restrict psc_g_segment_start_flags
    )
{
    // index psc_K in first dimension used for psc_carry storage
        // Avoid bank conflicts by adding a single 32-bit psc_value to the size of
        // the scan type.
        struct __attribute__ ((__packed__)) psc_wrapped_scan_type
        {
            psc_scan_type psc_value;
            int psc_dummy;
        };
    // padded in psc_WG_SIZE to avoid bank conflicts
    LOCAL_MEM struct psc_wrapped_scan_type psc_ldata[psc_K + 1][psc_WG_SIZE + 1];
        LOCAL_MEM char psc_l_segment_start_flags[psc_K][psc_WG_SIZE];
        LOCAL_MEM psc_index_type psc_l_first_segment_start_in_subtree[psc_WG_SIZE];
        // only relevant/populated for local id 0
        psc_index_type psc_first_segment_start_in_interval = NO_SEG_BOUNDARY;
        psc_index_type psc_first_segment_start_in_k_group, psc_first_segment_start_in_subtree;
        LOCAL_MEM ushort psc_l_patterns[psc_WG_SIZE*psc_K];
    const psc_index_type psc_interval_begin = psc_interval_size * psc_GID_0;
    const psc_index_type psc_interval_end   = min(psc_interval_begin + psc_interval_size, N);
    const psc_index_type psc_unit_size  = psc_K * psc_WG_SIZE;
    psc_index_type psc_unit_base = psc_interval_begin;
            for(; psc_unit_base + psc_unit_size <= psc_interval_end; psc_unit_base += psc_unit_size)
        {
            // (if there are ones that need to be fetched into local)
                for(psc_index_type psc_k = 0; psc_k < psc_K; psc_k++)
                {const psc_index_type psc_offset = psc_k*psc_WG_SIZE + psc_LID_0;
                    const psc_index_type psc_read_i = psc_unit_base + psc_offset;
                        {
                            psc_l_patterns[psc_offset] = patterns[psc_read_i];
                        }
                }
                local_barrier();
            for(psc_index_type psc_k = 0; psc_k < psc_K; psc_k++)
            {
                const psc_index_type psc_offset = psc_k*psc_WG_SIZE + psc_LID_0;
                const psc_index_type psc_read_i = psc_unit_base + psc_offset;
                {
                        ushort patterns_im1;
                            if (psc_offset + -1 >= 0)
                                patterns_im1 = psc_l_patterns[psc_offset + -1];
                            else if (psc_read_i + -1 >= 0)
                                patterns_im1 = patterns[psc_read_i + -1];
                        ushort patterns_i;
                            if (psc_offset + 0 >= 0)
                                patterns_i = psc_l_patterns[psc_offset + 0];
                            else if (psc_read_i + 0 >= 0)
                                patterns_i = patterns[psc_read_i + 0];
                    psc_scan_type psc_scan_value = psc_INPUT_EXPR(psc_read_i);
                    const psc_index_type psc_o_mod_k = psc_offset % psc_K;
                    const psc_index_type psc_o_div_k = psc_offset / psc_K;
                    psc_ldata[psc_o_mod_k][psc_o_div_k].psc_value = psc_scan_value;
                }
            }
        }
}
"""
import os
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '0'
import pyopencl as cl
platform = cl.get_platforms()[-1]
device = platform.get_devices()[-1]
ctx = cl.Context([device])
prg = cl.Program(ctx, src).build()

The offending line is the last one psc_ldata[psc_o_mod_k][psc_o_div_k].psc_value = psc_scan_value;.

FYI, it worked on my previous 15" MBP with AMD Radeon Pro 555.

Any idea?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant