Commit d157ee0: Merge branch 'master' into gold/2021

2 parents: 97cc81f + 0e8d4fb

7 files changed: +119 -14 lines

dpctl/dptensor/numpy_usm_shared.py

Lines changed: 21 additions & 10 deletions
```diff
@@ -1,4 +1,4 @@
-##===---------- dparray.py - dpctl -------*- Python -*----===##
+##===---------- numpy_usm_shared.py - dpctl -------*- Python -*----===##
 ##
 ## Data Parallel Control (dpCtl)
 ##
@@ -19,7 +19,7 @@
 ##===----------------------------------------------------------------------===##
 ###
 ### \file
-### This file implements a dparray - USM aware implementation of ndarray.
+### This file implements a numpy_usm_shared - USM aware implementation of ndarray.
 ##===----------------------------------------------------------------------===##

 import numpy as np
@@ -70,12 +70,17 @@ class ndarray(np.ndarray):
     with a foreign allocator.
     """

+    external_usm_checkers = []
+
+    def add_external_usm_checker(func):
+        ndarray.external_usm_checkers.append(func)
+
     def __new__(
         subtype, shape, dtype=float, buffer=None, offset=0, strides=None, order=None
     ):
         # Create a new array.
         if buffer is None:
-            dprint("dparray::ndarray __new__ buffer None")
+            dprint("numpy_usm_shared::ndarray __new__ buffer None")
             nelems = np.prod(shape)
             dt = np.dtype(dtype)
             isz = dt.itemsize
@@ -102,7 +107,7 @@ def __new__(
             return new_obj
         # zero copy if buffer is a usm backed array-like thing
         elif hasattr(buffer, array_interface_property):
-            dprint("dparray::ndarray __new__ buffer", array_interface_property)
+            dprint("numpy_usm_shared::ndarray __new__ buffer", array_interface_property)
             # also check for array interface
             new_obj = np.ndarray.__new__(
                 subtype,
@@ -124,7 +129,7 @@ def __new__(
             )
             return new_obj
         else:
-            dprint("dparray::ndarray __new__ buffer not None and not sycl_usm")
+            dprint("numpy_usm_shared::ndarray __new__ buffer not None and not sycl_usm")
             nelems = np.prod(shape)
             # must copy
             ar = np.ndarray(
@@ -158,6 +163,9 @@ def __new__(
             )
             return new_obj

+    def __sycl_usm_array_interface__(self):
+        return self._getter_sycl_usm_array_interface()
+
     def _getter_sycl_usm_array_interface_(self):
         ary_iface = self.__array_interface__
         _base = _get_usm_base(self)
@@ -186,6 +194,9 @@ def __array_finalize__(self, obj):
         # subclass of ndarray, including our own.
         if hasattr(obj, array_interface_property):
             return
+        for ext_checker in ndarray.external_usm_checkers:
+            if ext_checker(obj):
+                return
         if isinstance(obj, np.ndarray):
             ob = self
             while isinstance(ob, np.ndarray):
@@ -200,7 +211,7 @@ def __array_finalize__(self, obj):
             )

     # Tell Numba to not treat this type just like a NumPy ndarray but to propagate its type.
-    # This way it will use the custom dparray allocator.
+    # This way it will use the custom numpy_usm_shared allocator.
     __numba_no_subtype_ndarray__ = True

     # Convert to a NumPy ndarray.
@@ -234,8 +245,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
         else:
             return NotImplemented
         # Have to avoid recursive calls to array_ufunc here.
-        # If no out kwarg then we create a dparray out so that we get
-        # USM memory. However, if kwarg has dparray-typed out then
+        # If no out kwarg then we create a numpy_usm_shared out so that we get
+        # USM memory. However, if kwarg has numpy_usm_shared-typed out then
         # array_ufunc is called recursively so we cast out as regular
         # NumPy ndarray (having a USM data pointer).
         if kwargs.get("out", None) is None:
@@ -246,7 +257,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
             out_as_np = np.ndarray(out.shape, out.dtype, out)
             kwargs["out"] = out_as_np
         else:
-            # If they manually gave dparray as out kwarg then we have to also
+            # If they manually gave numpy_usm_shared as out kwarg then we have to also
             # cast as regular NumPy ndarray to avoid recursion.
             if isinstance(kwargs["out"], ndarray):
                 out = kwargs["out"]
@@ -271,7 +282,7 @@ def isdef(x):
     cname = c[0]
     if isdef(cname):
         continue
-    # For now we do the simple thing and copy the types from NumPy module into dparray module.
+    # For now we do the simple thing and copy the types from NumPy module into numpy_usm_shared module.
     new_func = "%s = np.%s" % (cname, cname)
     try:
         the_code = compile(new_func, "__init__", "exec")
```
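
The new `external_usm_checkers` hook lets a third-party library teach `__array_finalize__` to recognize its own USM-backed objects. Below is a minimal sketch of how such a library might register a checker; `MyUsmArray` and `is_my_usm_array` are hypothetical names, and the import path is assumed from this file's location:

```python
from dpctl.dptensor import numpy_usm_shared as nus


class MyUsmArray:
    """Hypothetical third-party container backed by USM-shared memory."""


def is_my_usm_array(obj):
    # Return True when obj is known to wrap USM memory.
    return isinstance(obj, MyUsmArray)


# Registered checkers run inside ndarray.__array_finalize__, so views
# derived from recognized objects keep their USM pedigree instead of
# falling through to the plain-NumPy path.
nus.ndarray.add_external_usm_checker(is_my_usm_array)
```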
examples/cython/sycl_direct_linkage/README.md

Lines changed: 57 additions & 0 deletions
````diff
@@ -0,0 +1,57 @@
+# Example "sycl_direct_linkage"
+
+This Cython extension does not use dpCtl and links to SYCL directly.
+
+It exposes a `columnwise_total` function that uses oneMKL to compute
+the totals for each column of its argument matrix in double precision,
+expected as an ordinary NumPy array in C-contiguous layout.
+
+This function performs the following steps:
+
+1. Creates a SYCL queue using the default device selector
+2. Creates a SYCL buffer around the matrix data
+3. Creates a vector `v_ones` with all elements set to one,
+   and allocates memory for the result
+4. Calls oneMKL to compute xGEMV, as dot(v_ones, M)
+5. Returns the result as a NumPy array
+
+This extension does not allow one to control the device/queue to
+which execution of the kernel is scheduled.
+
+A related example, "sycl_buffer", modifies this example to use
+`dpCtl` to retrieve the current queue, allowing a user to control
+the queue and to avoid the overhead of queue creation.
+
+To illustrate the queue-creation overhead incurred on each call, compare execution with
+the default queue, which here is an Intel Gen9 GPU on the OpenCL backend:
+
+```
+(idp) [11:24:38 ansatnuc04 sycl_direct_linkage]$ SYCL_BE=PI_OPENCL python bench.py
+========== Executing warm-up ==========
+NumPy result: [1. 1. 1. ... 1. 1. 1.]
+SYCL(default_device) result: [1. 1. 1. ... 1. 1. 1.]
+Running time of 100 calls to columnwise_total on matrix with shape (10000, 4098)
+Times for default_selector, inclusive of queue creation:
+[19.384219504892826, 19.49932464491576, 19.613155928440392, 19.64031868893653, 19.752969074994326]
+Times for NumPy
+[3.5394036192446947, 3.498957809060812, 3.4925728561356664, 3.5036555202677846, 3.493739523924887]
+```
+
+vs. timing when `dpctl`'s current queue is being reused:
+
+```
+(idp) [11:29:14 ansatnuc04 sycl_buffer]$ python bench.py
+========== Executing warm-up ==========
+NumPy result: [1. 1. 1. ... 1. 1. 1.]
+SYCL(Intel(R) Core(TM) i7-10710U CPU @ 1.10GHz) result: [1. 1. 1. ... 1. 1. 1.]
+SYCL(Intel(R) Graphics Gen9 [0x9bca]) result: [1. 1. 1. ... 1. 1. 1.]
+Times for 'opencl:cpu:0'
+[2.9164800881408155, 2.8714500251226127, 2.9770236839540303, 2.913622073829174, 2.7949972581118345]
+Times for 'opencl:gpu:0'
+[9.529508924111724, 10.288004886358976, 10.189113245811313, 10.197128206957132, 10.26169267296791]
+Times for NumPy
+[3.4809365631081164, 3.42917942116037, 3.42471009073779, 3.3689011191017926, 3.4336009239777923]
+```
+
+So the overhead of ``sycl::queue`` creation per call is roughly comparable to the time to
+execute the actual computation.
````
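
Step 4 of the README reduces columnwise summation to a matrix-vector product: if `v_ones` has one entry per row of `M`, then entry *i* of dot(v_ones, M) is the total of column *i*. A NumPy sketch of the same identity (variable names here are illustrative, not part of the extension):

```python
import numpy as np

M = np.full((10000, 4098), 1e-4, dtype="d")  # matrix shape used by bench.py
v_ones = np.ones(M.shape[0], dtype="d")      # one entry per row of M

# Summing over rows via GEMV: dot(v_ones, M)[i] == M[:, i].sum(),
# which is the reduction the extension delegates to oneMKL's xGEMV.
totals = v_ones @ M
assert np.allclose(totals, M.sum(axis=0))
```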
examples/cython/sycl_direct_linkage/bench.py

Lines changed: 36 additions & 0 deletions
```diff
@@ -0,0 +1,36 @@
+import dpctl
+import syclbuffer_naive as sb
+import numpy as np
+
+X = np.full((10 ** 4, 4098), 1e-4, dtype="d")
+
+# warm-up
+print("=" * 10 + " Executing warm-up " + "=" * 10)
+print("NumPy result: ", X.sum(axis=0))
+
+print(
+    "SYCL(default_device) result: {}".format(
+        sb.columnwise_total(X),
+    )
+)
+
+import timeit
+
+print(
+    "Running time of 100 calls to columnwise_total on matrix with shape {}".format(
+        X.shape
+    )
+)
+
+print("Times for default_selector, inclusive of queue creation:")
+print(
+    timeit.repeat(
+        stmt="sb.columnwise_total(X)",
+        setup="sb.columnwise_total(X)",  # ensure JIT compilation is not counted
+        number=100,
+        globals=globals(),
+    )
+)
+
+print("Times for NumPy")
+print(timeit.repeat(stmt="X.sum(axis=0)", number=100, globals=globals()))
```
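
Note that each value printed by `timeit.repeat` is the cumulative time in seconds for `number=100` calls, not a per-call average; divide the reported value by 100 for the per-call cost.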

examples/cython/sycl_direct_linkage/run.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,4 +1,4 @@
-import syclbuffer as sb
+import syclbuffer_naive as sb
 import numpy as np

 X = np.random.randn(20, 10)
```

examples/cython/sycl_direct_linkage/sycl_function.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,6 +1,6 @@
 #include <CL/sycl.hpp>
 #include "sycl_function.hpp"
-#include "mkl_blas_sycl.hpp"
+#include <oneapi/mkl.hpp>
 #include "mkl.h"

 int c_columnwise_total(cl::sycl::queue &q, size_t n, size_t m, double *mat, double *ct) {
```

examples/cython/usm_memory/blackscholes.pyx

Lines changed: 1 addition & 1 deletion
```diff
@@ -2,7 +2,7 @@
 # distutils: language=c++

 cimport dpctl as c_dpctl
-cimport dpctl._memory as c_dpctl_mem
+cimport dpctl.memory as c_dpctl_mem
 cimport numpy as cnp
 from cython cimport floating

```
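
The rename reflects the private `dpctl._memory` becoming the public `dpctl.memory` module. A minimal pure-Python sketch of using it to back a NumPy array with USM-shared memory; this assumes `MemoryUSMShared` and its buffer-protocol support in the dpctl of this commit:

```python
import numpy as np
import dpctl.memory as dpctl_mem

# Allocate 512 bytes (64 float64 values) of USM-shared memory
# on the device targeted by the current SYCL queue.
mbuf = dpctl_mem.MemoryUSMShared(64 * 8)

# Zero-copy NumPy view over the USM allocation via the buffer protocol.
arr = np.ndarray((64,), dtype="d", buffer=mbuf)
arr[:] = 0.0
```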
examples/cython/usm_memory/sycl_blackscholes.cpp

Lines changed: 2 additions & 1 deletion
```diff
@@ -1,7 +1,8 @@
 #include <CL/sycl.hpp>
+#include <oneapi/mkl.hpp>
+#include <oneapi/mkl/rng/device.hpp>
 #include "dpctl_sycl_types.h"
 #include "sycl_blackscholes.hpp"
-#include "mkl_rng_sycl_device.hpp"

 template<typename T>
 class black_scholes_kernel;
```
