Skip to content

Commit c134996

Browse files
committed
Merge branch 'release-2.3.0'
2 parents aa14760 + 6899a2f commit c134996

File tree

7 files changed

+50
-18
lines changed

7 files changed

+50
-18
lines changed

VERSION

+2-2
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
MAJOR = 2
22
MINOR = 3
3-
PATCH = 0-rc3
3+
PATCH = 0
44
# A specific DATE (YYYY-MM-DD) fixes an official release, otherwise
55
# it is considered Development version.
6-
DATE = 2022-06-08
6+
DATE = 2022-06-26
77

88

Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
bokeh==1.0.4
22
matplotlib==3.0.2
3-
numpy==1.21.0
3+
numpy==1.22.0
44
pandas==0.23.4
55
pandas-profiling==1.4.1
66
seaborn==0.9.0
+1-1
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
numpy==1.21.0
1+
numpy==1.22.0

src/acc/opencl/acc_opencl.c

+3-3
Original file line number | Diff line number | Diff line change
@@ -217,7 +217,7 @@ int c_dbcsr_acc_init(void) {
217217
char buffer[ACC_OPENCL_BUFFERSIZE];
218218
const char *const env_devmatch = getenv("ACC_OPENCL_DEVMATCH"), *const env_devtype = getenv("ACC_OPENCL_DEVTYPE");
219219
const char *const env_priority = getenv("ACC_OPENCL_PRIORITY"), *const env_xhints = getenv("ACC_OPENCL_XHINTS");
220-
const char *const env_nullify = getenv("ACC_OPENCL_NULLIFY"), *const env_dump_acc = getenv("ACC_OPENCL_DUMP");
220+
const char *const env_devcopy = getenv("ACC_OPENCL_DEVCOPY"), *const env_dump_acc = getenv("ACC_OPENCL_DUMP");
221221
const char *const env_verbose = getenv("ACC_OPENCL_VERBOSE"), *const env_flush = getenv("ACC_OPENCL_FLUSH");
222222
const char *const env_device = getenv("ACC_OPENCL_DEVICE"), *const env_timer = getenv("ACC_OPENCL_TIMER");
223223
const char *const env_share = getenv("ACC_OPENCL_SHARE"), *const env_async = getenv("ACC_OPENCL_ASYNC");
@@ -233,7 +233,7 @@ int c_dbcsr_acc_init(void) {
233233
# endif
234234
c_dbcsr_acc_opencl_config.verbosity = (NULL == env_verbose ? 0 : atoi(env_verbose));
235235
c_dbcsr_acc_opencl_config.priority = (NULL == env_priority ? /*default*/ 3 : atoi(env_priority));
236-
c_dbcsr_acc_opencl_config.nullify = (NULL == env_nullify ? /*default*/ 0 : atoi(env_nullify));
236+
c_dbcsr_acc_opencl_config.devcopy = (NULL == env_devcopy ? /*default*/ 0 : atoi(env_devcopy));
237237
c_dbcsr_acc_opencl_config.xhints = (NULL == env_xhints ? /*default*/ 1 : atoi(env_xhints));
238238
c_dbcsr_acc_opencl_config.share = (NULL == env_share ? /*default*/ 0 : atoi(env_share));
239239
c_dbcsr_acc_opencl_config.async = (NULL == env_async ? /*default*/ 3 : atoi(env_async));
@@ -557,7 +557,7 @@ int c_dbcsr_acc_finalize(void) {
557557
for (i = 0; i < ACC_OPENCL_DEVICES_MAXCOUNT; ++i) {
558558
const cl_device_id device_id = c_dbcsr_acc_opencl_config.devices[i];
559559
if (NULL != device_id) {
560-
# if defined(CL_VERSION_1_2)
560+
# if defined(CL_VERSION_1_2) && defined(_DEBUG)
561561
ACC_OPENCL_CHECK(clReleaseDevice(device_id), "release device", result);
562562
# endif
563563
/* c_dbcsr_acc_opencl_create_context scans for non-NULL devices */

src/acc/opencl/acc_opencl.h

+2-2
Original file line number | Diff line number | Diff line change
@@ -223,8 +223,8 @@ typedef struct c_dbcsr_acc_opencl_config_t {
223223
cl_int nthreads;
224224
/** How to apply/use stream priorities. */
225225
cl_int priority;
226-
/** How to zero device-side buffers. */
227-
cl_int nullify;
226+
/** How to zero/copy device-side buffers. */
227+
cl_int devcopy;
228228
/** Execution-hints (command stream). */
229229
cl_int xhints;
230230
/** Share streams across threads. */

src/acc/opencl/acc_opencl_mem.c

+38-6
Original file line number | Diff line number | Diff line change
@@ -224,7 +224,9 @@ int c_dbcsr_acc_dev_mem_allocate(void** dev_mem, size_t nbytes) {
224224
# endif
225225
clReleaseMemObject(buffer);
226226
# if defined(CL_VERSION_2_0)
227-
/*if (NULL != ptr)*/ clSVMFree(context, ptr);
227+
if (0 != c_dbcsr_acc_opencl_config.device[tid].svm_interop /*&& (NULL != ptr)*/) {
228+
clSVMFree(context, ptr);
229+
}
228230
# endif
229231
result = EXIT_FAILURE;
230232
}
@@ -271,8 +273,10 @@ int c_dbcsr_acc_dev_mem_deallocate(void* dev_mem) {
271273
}
272274
# endif
273275
# if defined(CL_VERSION_2_0)
274-
assert(NULL != c_dbcsr_acc_opencl_config.device[tid].context);
275-
clSVMFree(c_dbcsr_acc_opencl_config.device[tid].context, ptr); /*if (NULL != ptr)*/
276+
if (0 != c_dbcsr_acc_opencl_config.device[tid].svm_interop /*&& (NULL != ptr)*/) {
277+
assert(NULL != c_dbcsr_acc_opencl_config.device[tid].context);
278+
clSVMFree(c_dbcsr_acc_opencl_config.device[tid].context, ptr);
279+
}
276280
# endif
277281
}
278282
# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE)
@@ -363,8 +367,36 @@ int c_dbcsr_acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbyt
363367
# endif
364368
assert((NULL != devmem_src || 0 == nbytes) && (NULL != devmem_dst || 0 == nbytes) && NULL != stream);
365369
if (NULL != devmem_src && NULL != devmem_dst && 0 != nbytes) {
366-
result = clEnqueueCopyBuffer(*ACC_OPENCL_STREAM(stream), *ACC_OPENCL_MEM(devmem_src), *ACC_OPENCL_MEM(devmem_dst),
367-
0 /*src_offset*/, 0 /*dst_offset*/, nbytes, 0, NULL, NULL);
370+
const cl_mem *const src = ACC_OPENCL_MEM(devmem_src), *const dst = ACC_OPENCL_MEM(devmem_dst);
371+
assert(NULL != *src && NULL != *dst);
372+
if (*src != *dst) {
373+
const cl_command_queue queue = *ACC_OPENCL_STREAM(stream);
374+
if (0 == (2 & c_dbcsr_acc_opencl_config.devcopy)) {
375+
result = clEnqueueCopyBuffer(queue, *src, *dst, 0 /*src_offset*/, 0 /*dst_offset*/, nbytes, 0, NULL, NULL);
376+
}
377+
else {
378+
static volatile int lock; /* creating cl_kernel and clSetKernelArg must be synchronized */
379+
static cl_kernel kernel = NULL;
380+
LIBXSMM_ATOMIC_ACQUIRE(&lock, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED);
381+
if (NULL == kernel) { /* generate kernel */
382+
const char source[] = "kernel void memcpy_d2d(global uchar *restrict src, global uchar *restrict dst) {\n"
383+
" const size_t i = get_global_id(0);\n"
384+
" dst[i] = src[i];\n"
385+
"}\n";
386+
result = c_dbcsr_acc_opencl_kernel(source, "memcpy_d2d" /*kernel_name*/, NULL /*build_params*/, NULL /*build_options*/,
387+
NULL /*try_build_options*/, NULL /*try_ok*/, NULL /*extnames*/, 0 /*num_exts*/, &kernel);
388+
}
389+
if (EXIT_SUCCESS == result) {
390+
assert(NULL != kernel);
391+
ACC_OPENCL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), src), "set src argument of memcpy_d2d kernel", result);
392+
ACC_OPENCL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), dst), "set dst argument of memcpy_d2d kernel", result);
393+
ACC_OPENCL_CHECK(clEnqueueNDRangeKernel(
394+
queue, kernel, 1 /*work_dim*/, NULL /*offset*/, &nbytes, NULL /*local_work_size*/, 0, NULL, NULL),
395+
"launch memcpy_d2d kernel", result);
396+
}
397+
LIBXSMM_ATOMIC_RELEASE(&lock, LIBXSMM_ATOMIC_RELAXED);
398+
}
399+
}
368400
}
369401
# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE)
370402
c_dbcsr_timestop(&routine_handle);
@@ -385,7 +417,7 @@ int c_dbcsr_acc_memset_zero(void* dev_mem, size_t offset, size_t nbytes, void* s
385417
if (0 != nbytes) {
386418
const cl_command_queue queue = *ACC_OPENCL_STREAM(stream);
387419
const cl_mem* const buffer = ACC_OPENCL_MEM(dev_mem);
388-
if (0 == c_dbcsr_acc_opencl_config.nullify) {
420+
if (0 == (1 & c_dbcsr_acc_opencl_config.devcopy)) {
389421
static const cl_uchar pattern = 0; /* fill with zeros */
390422
result = clEnqueueFillBuffer(queue, *buffer, &pattern, sizeof(pattern), offset, nbytes, 0, NULL, NULL);
391423
}

src/acc/opencl/smm/opencl_libsmm.c

+3-3
Original file line number | Diff line number | Diff line change
@@ -891,8 +891,8 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v
891891
clSetKernelArg(config->kernel, 1, sizeof(int), &offset), "set offset argument of transpose kernel", result);
892892
ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 2, sizeof(cl_mem), ACC_OPENCL_MEM(dev_data)),
893893
"set matrix-data argument of transpose kernel", result);
894-
ACC_OPENCL_CHECK(
895-
clEnqueueNDRangeKernel(queue, config->kernel, 1 /*work_dim*/, NULL, &work_size, &config->wgsize, 0, NULL, perf_event),
894+
ACC_OPENCL_CHECK(clEnqueueNDRangeKernel(queue, config->kernel, 1 /*work_dim*/, NULL /*offset*/, &work_size, &config->wgsize,
895+
0, NULL, perf_event),
896896
"launch transpose kernel", result);
897897
/* eventually update performance counters inside of locked region */
898898
# if !defined(OPENCL_LIBSMM_VALIDATE_TRANS)
@@ -1635,7 +1635,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
16351635
ACC_OPENCL_CHECK(
16361636
clSetKernelArg(config->kernel[kernel_idx], 5, sizeof(int), &bs), "set minibatch argument of SMM-kernel", result);
16371637
}
1638-
ACC_OPENCL_CHECK(clEnqueueNDRangeKernel(queue, config->kernel[kernel_idx], 1 /*work_dim*/, NULL, &work_size,
1638+
ACC_OPENCL_CHECK(clEnqueueNDRangeKernel(queue, config->kernel[kernel_idx], 1 /*work_dim*/, NULL /*offset*/, &work_size,
16391639
config->wgsize + kernel_idx, 0, NULL, perf_event),
16401640
"launch SMM-kernel", result);
16411641
/* eventually update performance counters inside of locked region */

0 commit comments

Comments
 (0)