You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
When trying the simple example(in the doc) of GenericScanKernel with my new MBP 16", it works on all devices (POCL, Intel HD Graphics, Apple cpu) except the last one (AMD Radeon Pro 5500M), with the error
SC failed. No reason given.
(options: -I /Users/yves/miniforge3/lib/python3.11/site-packages/pyopencl/cl)
(source saved as /var/folders/9t/b8p_snrs6_g21pk72_lkyjxw0000gn/T/tmpgxor7x8t.cl)
I reduced the source file to the smallest making the error to appear, to get:
src = """#define local_barrier() barrier(CLK_LOCAL_MEM_FENCE);
#define WITHIN_KERNEL /* empty */
#define KERNEL __kernel
#define GLOBAL_MEM __global
#define LOCAL_MEM __local
#define LOCAL_MEM_ARG __local
#define REQD_WG_SIZE(X,Y,psc_Z) __attribute__((reqd_work_group_size(X, Y, psc_Z)))
#define psc_LID_0 ((ptrdiff_t) get_local_id(0))
#define psc_LID_1 ((ptrdiff_t) get_local_id(1))
#define psc_LID_2 ((ptrdiff_t) get_local_id(2))
#define psc_GID_0 ((ptrdiff_t) get_group_id(0))
#define psc_GID_1 ((ptrdiff_t) get_group_id(1))
#define psc_GID_2 ((ptrdiff_t) get_group_id(2))
#define psc_LDIM_0 ((ptrdiff_t) get_local_size(0))
#define psc_LDIM_1 ((ptrdiff_t) get_local_size(1))
#define psc_LDIM_2 ((ptrdiff_t) get_local_size(2))
#define psc_GDIM_0 ((ptrdiff_t) get_num_groups(0))
#define psc_GDIM_1 ((ptrdiff_t) get_num_groups(1))
#define psc_GDIM_2 ((ptrdiff_t) get_num_groups(2))
#if __OPENCL_C_VERSION__ < 120
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
//CL//
#define psc_WG_SIZE 256
#define psc_SCAN_EXPR(a, b, across_seg_boundary) across_seg_boundary || b.y & 0b10000000 || b.y & 0b00001000 ? b : b.z & 0b00001000 ? b : a
#define psc_INPUT_EXPR(i) ((ushort4)(segmentsIn[i], patterns_i, patterns_im1, 0))
#define psc_IS_SEG_START(i, a) (i % iImageWidth == 0)
typedef ushort4 psc_scan_type;
typedef int psc_index_type;
// NO_SEG_BOUNDARY is the largest representable integer in psc_index_type.
// This assumption is used in code below.
#define NO_SEG_BOUNDARY 2147483647
//CL//
#define psc_K 16
// #define psc_DEBUG
KERNEL
REQD_WG_SIZE(psc_WG_SIZE, 1, 1)
void scan_lev1(
__global ushort *patterns, __global ushort *segmentsIn, __global ushort *segmentsOut, __global ushort *isOuterContour, __global ushort *contoursLUT, int iImageWidth,
GLOBAL_MEM psc_scan_type *restrict psc_partial_scan_buffer,
const psc_index_type N,
const psc_index_type psc_interval_size
, GLOBAL_MEM psc_scan_type *restrict psc_interval_results
// NO_SEG_BOUNDARY if no segment boundary in interval.
, GLOBAL_MEM psc_index_type *restrict psc_g_first_segment_start_in_interval
, GLOBAL_MEM char *restrict psc_g_segment_start_flags
)
{
// index psc_K in first dimension used for psc_carry storage
// Avoid bank conflicts by adding a single 32-bit psc_value to the size of
// the scan type.
struct __attribute__ ((__packed__)) psc_wrapped_scan_type
{
psc_scan_type psc_value;
int psc_dummy;
};
// padded in psc_WG_SIZE to avoid bank conflicts
LOCAL_MEM struct psc_wrapped_scan_type psc_ldata[psc_K + 1][psc_WG_SIZE + 1];
LOCAL_MEM char psc_l_segment_start_flags[psc_K][psc_WG_SIZE];
LOCAL_MEM psc_index_type psc_l_first_segment_start_in_subtree[psc_WG_SIZE];
// only relevant/populated for local id 0
psc_index_type psc_first_segment_start_in_interval = NO_SEG_BOUNDARY;
psc_index_type psc_first_segment_start_in_k_group, psc_first_segment_start_in_subtree;
LOCAL_MEM ushort psc_l_patterns[psc_WG_SIZE*psc_K];
const psc_index_type psc_interval_begin = psc_interval_size * psc_GID_0;
const psc_index_type psc_interval_end = min(psc_interval_begin + psc_interval_size, N);
const psc_index_type psc_unit_size = psc_K * psc_WG_SIZE;
psc_index_type psc_unit_base = psc_interval_begin;
for(; psc_unit_base + psc_unit_size <= psc_interval_end; psc_unit_base += psc_unit_size)
{
// (if there are ones that need to be fetched into local)
for(psc_index_type psc_k = 0; psc_k < psc_K; psc_k++)
{const psc_index_type psc_offset = psc_k*psc_WG_SIZE + psc_LID_0;
const psc_index_type psc_read_i = psc_unit_base + psc_offset;
{
psc_l_patterns[psc_offset] = patterns[psc_read_i];
}
}
local_barrier();
for(psc_index_type psc_k = 0; psc_k < psc_K; psc_k++)
{
const psc_index_type psc_offset = psc_k*psc_WG_SIZE + psc_LID_0;
const psc_index_type psc_read_i = psc_unit_base + psc_offset;
{
ushort patterns_im1;
if (psc_offset + -1 >= 0)
patterns_im1 = psc_l_patterns[psc_offset + -1];
else if (psc_read_i + -1 >= 0)
patterns_im1 = patterns[psc_read_i + -1];
ushort patterns_i;
if (psc_offset + 0 >= 0)
patterns_i = psc_l_patterns[psc_offset + 0];
else if (psc_read_i + 0 >= 0)
patterns_i = patterns[psc_read_i + 0];
psc_scan_type psc_scan_value = psc_INPUT_EXPR(psc_read_i);
const psc_index_type psc_o_mod_k = psc_offset % psc_K;
const psc_index_type psc_o_div_k = psc_offset / psc_K;
psc_ldata[psc_o_mod_k][psc_o_div_k].psc_value = psc_scan_value;
}
}
}
}
"""
import os
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '0'
import pyopencl as cl
platform = cl.get_platforms()[-1]
device = platform.get_devices()[-1]
ctx = cl.Context([device])
prg = cl.Program(ctx, src).build()
The offending line is the last one psc_ldata[psc_o_mod_k][psc_o_div_k].psc_value = psc_scan_value;.
FYI, it worked on my previous 15" MBP with AMD Radeon Pro 555.
Any idea?
The text was updated successfully, but these errors were encountered:
When trying the simple example(in the doc) of GenericScanKernel with my new MBP 16", it works on all devices (POCL, Intel HD Graphics, Apple cpu) except the last one (AMD Radeon Pro 5500M), with the error
I reduced the source file to the smallest making the error to appear, to get:
The offending line is the last one
psc_ldata[psc_o_mod_k][psc_o_div_k].psc_value = psc_scan_value;
.FYI, it worked on my previous 15" MBP with AMD Radeon Pro 555.
Any idea?
The text was updated successfully, but these errors were encountered: