Commit d157ee0: Merge branch 'master' into gold/2021

2 parents: 97cc81f + 0e8d4fb

7 files changed: +119 -14 lines

dpctl/dptensor/numpy_usm_shared.py

Lines changed: 21 additions & 10 deletions
```diff
@@ -1,4 +1,4 @@
-##===---------- dparray.py - dpctl -------*- Python -*----===##
+##===---------- numpy_usm_shared.py - dpctl -------*- Python -*----===##
 ##
 ## Data Parallel Control (dpCtl)
 ##
@@ -19,7 +19,7 @@
 ##===----------------------------------------------------------------------===##
 ###
 ### \file
-### This file implements a dparray - USM aware implementation of ndarray.
+### This file implements a numpy_usm_shared - USM aware implementation of ndarray.
 ##===----------------------------------------------------------------------===##

 import numpy as np
@@ -70,12 +70,17 @@ class ndarray(np.ndarray):
     with a foreign allocator.
     """

+    external_usm_checkers = []
+
+    def add_external_usm_checker(func):
+        ndarray.external_usm_checkers.append(func)
+
     def __new__(
         subtype, shape, dtype=float, buffer=None, offset=0, strides=None, order=None
     ):
         # Create a new array.
         if buffer is None:
-            dprint("dparray::ndarray __new__ buffer None")
+            dprint("numpy_usm_shared::ndarray __new__ buffer None")
             nelems = np.prod(shape)
             dt = np.dtype(dtype)
             isz = dt.itemsize
@@ -102,7 +107,7 @@ def __new__(
             return new_obj
         # zero copy if buffer is a usm backed array-like thing
         elif hasattr(buffer, array_interface_property):
-            dprint("dparray::ndarray __new__ buffer", array_interface_property)
+            dprint("numpy_usm_shared::ndarray __new__ buffer", array_interface_property)
             # also check for array interface
             new_obj = np.ndarray.__new__(
                 subtype,
@@ -124,7 +129,7 @@ def __new__(
             )
             return new_obj
         else:
-            dprint("dparray::ndarray __new__ buffer not None and not sycl_usm")
+            dprint("numpy_usm_shared::ndarray __new__ buffer not None and not sycl_usm")
             nelems = np.prod(shape)
             # must copy
             ar = np.ndarray(
@@ -158,6 +163,9 @@ def __new__(
             )
             return new_obj

+    def __sycl_usm_array_interface__(self):
+        return self._getter_sycl_usm_array_interface()
+
     def _getter_sycl_usm_array_interface_(self):
         ary_iface = self.__array_interface__
         _base = _get_usm_base(self)
@@ -186,6 +194,9 @@ def __array_finalize__(self, obj):
         # subclass of ndarray, including our own.
         if hasattr(obj, array_interface_property):
             return
+        for ext_checker in ndarray.external_usm_checkers:
+            if ext_checker(obj):
+                return
         if isinstance(obj, np.ndarray):
             ob = self
             while isinstance(ob, np.ndarray):
@@ -200,7 +211,7 @@ def __array_finalize__(self, obj):
             )

     # Tell Numba to not treat this type just like a NumPy ndarray but to propagate its type.
-    # This way it will use the custom dparray allocator.
+    # This way it will use the custom numpy_usm_shared allocator.
     __numba_no_subtype_ndarray__ = True

     # Convert to a NumPy ndarray.
@@ -234,8 +245,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
         else:
             return NotImplemented
         # Have to avoid recursive calls to array_ufunc here.
-        # If no out kwarg then we create a dparray out so that we get
-        # USM memory. However, if kwarg has dparray-typed out then
+        # If no out kwarg then we create a numpy_usm_shared out so that we get
+        # USM memory. However, if kwarg has numpy_usm_shared-typed out then
         # array_ufunc is called recursively so we cast out as regular
         # NumPy ndarray (having a USM data pointer).
         if kwargs.get("out", None) is None:
@@ -246,7 +257,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
             out_as_np = np.ndarray(out.shape, out.dtype, out)
             kwargs["out"] = out_as_np
         else:
-            # If they manually gave dparray as out kwarg then we have to also
+            # If they manually gave numpy_usm_shared as out kwarg then we have to also
             # cast as regular NumPy ndarray to avoid recursion.
             if isinstance(kwargs["out"], ndarray):
                 out = kwargs["out"]
@@ -271,7 +282,7 @@ def isdef(x):
     cname = c[0]
     if isdef(cname):
         continue
-    # For now we do the simple thing and copy the types from NumPy module into dparray module.
+    # For now we do the simple thing and copy the types from NumPy module into numpy_usm_shared module.
     new_func = "%s = np.%s" % (cname, cname)
     try:
         the_code = compile(new_func, "__init__", "exec")
```
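
The new `external_usm_checkers` hook lets a third-party library teach `__array_finalize__` to recognize its own USM-backed objects. Below is a minimal sketch of how such a library might register a checker; `MyUsmArray` and `is_my_usm_array` are hypothetical names, and the import path is assumed from this file's location:

```python
from dpctl.dptensor import numpy_usm_shared as nus


class MyUsmArray:
    """Hypothetical third-party container backed by USM-shared memory."""


def is_my_usm_array(obj):
    # Return True when obj is known to wrap USM memory.
    return isinstance(obj, MyUsmArray)


# Registered checkers run inside ndarray.__array_finalize__, so views
# derived from recognized objects keep their USM pedigree instead of
# falling through to the plain-NumPy path.
nus.ndarray.add_external_usm_checker(is_my_usm_array)
```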
examples/cython/sycl_direct_linkage/README.md

Lines changed: 57 additions & 0 deletions
````diff
@@ -0,0 +1,57 @@
+# Example "sycl_direct_linkage"
+
+This Cython extension does not use dpCtl and links to SYCL directly.
+
+It exposes a `columnwise_total` function that uses oneMKL to compute
+the totals for each column of its argument matrix in double precision,
+expected as an ordinary NumPy array in C-contiguous layout.
+
+This function performs the following steps:
+
+1. Creates a SYCL queue using the default device selector
+2. Creates a SYCL buffer around the matrix data
+3. Creates a vector `v_ones` with all elements set to one,
+   and allocates memory for the result
+4. Calls oneMKL to compute xGEMV, as dot(v_ones, M)
+5. Returns the result as a NumPy array
+
+This extension does not allow one to control the device/queue to
+which execution of the kernel is scheduled.
+
+A related example, "sycl_buffer", modifies this example to use
+`dpCtl` to retrieve the current queue, allowing a user to control
+the queue and to avoid the overhead of queue creation.
+
+To illustrate the queue-creation overhead incurred on each call, compare execution with
+the default queue, which here is an Intel Gen9 GPU on the OpenCL backend:
+
+```
+(idp) [11:24:38 ansatnuc04 sycl_direct_linkage]$ SYCL_BE=PI_OPENCL python bench.py
+========== Executing warm-up ==========
+NumPy result: [1. 1. 1. ... 1. 1. 1.]
+SYCL(default_device) result: [1. 1. 1. ... 1. 1. 1.]
+Running time of 100 calls to columnwise_total on matrix with shape (10000, 4098)
+Times for default_selector, inclusive of queue creation:
+[19.384219504892826, 19.49932464491576, 19.613155928440392, 19.64031868893653, 19.752969074994326]
+Times for NumPy
+[3.5394036192446947, 3.498957809060812, 3.4925728561356664, 3.5036555202677846, 3.493739523924887]
+```
+
+vs. timing when `dpctl`'s current queue is being reused:
+
+```
+(idp) [11:29:14 ansatnuc04 sycl_buffer]$ python bench.py
+========== Executing warm-up ==========
+NumPy result: [1. 1. 1. ... 1. 1. 1.]
+SYCL(Intel(R) Core(TM) i7-10710U CPU @ 1.10GHz) result: [1. 1. 1. ... 1. 1. 1.]
+SYCL(Intel(R) Graphics Gen9 [0x9bca]) result: [1. 1. 1. ... 1. 1. 1.]
+Times for 'opencl:cpu:0'
+[2.9164800881408155, 2.8714500251226127, 2.9770236839540303, 2.913622073829174, 2.7949972581118345]
+Times for 'opencl:gpu:0'
+[9.529508924111724, 10.288004886358976, 10.189113245811313, 10.197128206957132, 10.26169267296791]
+Times for NumPy
+[3.4809365631081164, 3.42917942116037, 3.42471009073779, 3.3689011191017926, 3.4336009239777923]
+```
+
+So the overhead of ``sycl::queue`` creation per call is roughly comparable to the time to
+execute the actual computation.
````
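
Step 4 of the README reduces columnwise summation to a matrix-vector product: if `v_ones` has one entry per row of `M`, then entry *i* of dot(v_ones, M) is the total of column *i*. A NumPy sketch of the same identity (variable names here are illustrative, not part of the extension):

```python
import numpy as np

M = np.full((10000, 4098), 1e-4, dtype="d")  # matrix shape used by bench.py
v_ones = np.ones(M.shape[0], dtype="d")      # one entry per row of M

# Summing over rows via GEMV: dot(v_ones, M)[i] == M[:, i].sum(),
# which is the reduction the extension delegates to oneMKL's xGEMV.
totals = v_ones @ M
assert np.allclose(totals, M.sum(axis=0))
```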
examples/cython/sycl_direct_linkage/bench.py

Lines changed: 36 additions & 0 deletions
```diff
@@ -0,0 +1,36 @@
+import dpctl
+import syclbuffer_naive as sb
+import numpy as np
+
+X = np.full((10 ** 4, 4098), 1e-4, dtype="d")
+
+# warm-up
+print("=" * 10 + " Executing warm-up " + "=" * 10)
+print("NumPy result: ", X.sum(axis=0))
+
+print(
+    "SYCL(default_device) result: {}".format(
+        sb.columnwise_total(X),
+    )
+)
+
+import timeit
+
+print(
+    "Running time of 100 calls to columnwise_total on matrix with shape {}".format(
+        X.shape
+    )
+)
+
+print("Times for default_selector, inclusive of queue creation:")
+print(
+    timeit.repeat(
+        stmt="sb.columnwise_total(X)",
+        setup="sb.columnwise_total(X)",  # ensure JIT compilation is not counted
+        number=100,
+        globals=globals(),
+    )
+)
+
+print("Times for NumPy")
+print(timeit.repeat(stmt="X.sum(axis=0)", number=100, globals=globals()))
```
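
Note that each value printed by `timeit.repeat` is the cumulative time in seconds for `number=100` calls, not a per-call average; divide the reported value by 100 for the per-call cost.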

examples/cython/sycl_direct_linkage/run.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,4 +1,4 @@
-import syclbuffer as sb
+import syclbuffer_naive as sb
 import numpy as np

 X = np.random.randn(20, 10)
```

examples/cython/sycl_direct_linkage/sycl_function.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,6 +1,6 @@
 #include <CL/sycl.hpp>
 #include "sycl_function.hpp"
-#include "mkl_blas_sycl.hpp"
+#include <oneapi/mkl.hpp>
 #include "mkl.h"

 int c_columnwise_total(cl::sycl::queue &q, size_t n, size_t m, double *mat, double *ct) {
```

examples/cython/usm_memory/blackscholes.pyx

Lines changed: 1 addition & 1 deletion
```diff
@@ -2,7 +2,7 @@
 # distutils: language=c++

 cimport dpctl as c_dpctl
-cimport dpctl._memory as c_dpctl_mem
+cimport dpctl.memory as c_dpctl_mem
 cimport numpy as cnp
 from cython cimport floating

```
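
The rename reflects the private `dpctl._memory` becoming the public `dpctl.memory` module. A minimal pure-Python sketch of using it to back a NumPy array with USM-shared memory; this assumes `MemoryUSMShared` and its buffer-protocol support in the dpctl of this commit:

```python
import numpy as np
import dpctl.memory as dpctl_mem

# Allocate 512 bytes (64 float64 values) of USM-shared memory
# on the device targeted by the current SYCL queue.
mbuf = dpctl_mem.MemoryUSMShared(64 * 8)

# Zero-copy NumPy view over the USM allocation via the buffer protocol.
arr = np.ndarray((64,), dtype="d", buffer=mbuf)
arr[:] = 0.0
```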
examples/cython/usm_memory/sycl_blackscholes.cpp

Lines changed: 2 additions & 1 deletion
```diff
@@ -1,7 +1,8 @@
 #include <CL/sycl.hpp>
+#include <oneapi/mkl.hpp>
+#include <oneapi/mkl/rng/device.hpp>
 #include "dpctl_sycl_types.h"
 #include "sycl_blackscholes.hpp"
-#include "mkl_rng_sycl_device.hpp"

 template<typename T>
 class black_scholes_kernel;
```
