# Example "sycl_direct_linkage"

This Cython extension does not use dpCtl and links to SYCL directly.

It exposes a `columnwise_total` function that uses oneMKL to compute
the total of each column of its argument matrix in double precision,
expected as an ordinary NumPy array in C-contiguous layout.

This function performs the following steps:

 1. Creates a SYCL queue using the default device selector
 2. Creates a SYCL buffer around the matrix data
 3. Creates a vector `v_ones` with all elements set to one,
    and allocates memory for the result
 4. Calls oneMKL to compute xGEMV, as `dot(v_ones, M)`
 5. Returns the result as a NumPy array

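
The GEMV step above can be illustrated in plain NumPy: multiplying the matrix from
the left by a vector of ones yields the column totals. This is only a sketch of the
math; the extension itself performs it via oneMKL on a SYCL device:

```python
import numpy as np

# dot(v_ones, M) computes the per-column totals of M (same math as xGEMV)
M = np.arange(12, dtype="d").reshape(3, 4)  # C-contiguous double-precision matrix
v_ones = np.ones(M.shape[0], dtype="d")     # vector of ones, one per row of M

totals = v_ones.dot(M)
assert np.array_equal(totals, M.sum(axis=0))
print(totals)  # [12. 15. 18. 21.]
```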
This extension does not allow one to control the device/queue to
which execution of the kernel is scheduled.

A related example, "sycl_buffer", modifies this example to use
`dpCtl` to retrieve the current queue, allowing a user to control the
queue and to avoid the overhead of queue creation.

To illustrate the queue-creation overhead incurred on each call, compare execution with the
default queue, which here is an Intel Gen9 GPU on the OpenCL backend:

```
(idp) [11:24:38 ansatnuc04 sycl_direct_linkage]$ SYCL_BE=PI_OPENCL python bench.py
========== Executing warm-up ==========
NumPy result: [1. 1. 1. ... 1. 1. 1.]
SYCL(default_device) result: [1. 1. 1. ... 1. 1. 1.]
Running time of 100 calls to columnwise_total on matrix with shape (10000, 4098)
Times for default_selector, inclusive of queue creation:
[19.384219504892826, 19.49932464491576, 19.613155928440392, 19.64031868893653, 19.752969074994326]
Times for NumPy
[3.5394036192446947, 3.498957809060812, 3.4925728561356664, 3.5036555202677846, 3.493739523924887]
```

vs. the timing when `dpctl`'s current queue is reused:

```
(idp) [11:29:14 ansatnuc04 sycl_buffer]$ python bench.py
========== Executing warm-up ==========
NumPy result: [1. 1. 1. ... 1. 1. 1.]
SYCL(Intel(R) Core(TM) i7-10710U CPU @ 1.10GHz) result: [1. 1. 1. ... 1. 1. 1.]
SYCL(Intel(R) Graphics Gen9 [0x9bca]) result: [1. 1. 1. ... 1. 1. 1.]
Times for 'opencl:cpu:0'
[2.9164800881408155, 2.8714500251226127, 2.9770236839540303, 2.913622073829174, 2.7949972581118345]
Times for 'opencl:gpu:0'
[9.529508924111724, 10.288004886358976, 10.189113245811313, 10.197128206957132, 10.26169267296791]
Times for NumPy
[3.4809365631081164, 3.42917942116037, 3.42471009073779, 3.3689011191017926, 3.4336009239777923]
```

So the overhead of ``sycl::queue`` creation per call is roughly comparable to the time
needed to execute the actual computation.
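
The NumPy baseline in these timings can be reproduced with a small `timeit` loop. The
sketch below is hypothetical (not the actual `bench.py`): it uses a smaller matrix than
the benchmark's (10000, 4098) so it runs quickly, with `M.sum(axis=0)` standing in as
the NumPy equivalent of `columnwise_total`:

```python
import timeit

import numpy as np

# Smaller stand-in for the (10000, 4098) matrix used in the benchmark
M = np.random.standard_normal((1000, 512))

# Each measurement times 10 calls to the NumPy column-total computation
times = timeit.repeat(stmt="M.sum(axis=0)", globals={"M": M}, number=10, repeat=3)
print("Times for NumPy:", times)
```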