Skip to content

Commit abcfc5d

Browse files
Merge pull request #821 from IntelPython/chebyshev_example
onemkl_gemv example expanded: iterative linear solver example working
2 parents 22d231c + fc7041e commit abcfc5d

File tree

24 files changed

+1645
-52
lines changed

24 files changed

+1645
-52
lines changed
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
cmake_minimum_required(VERSION 3.21)
2+
3+
project(external_usm_allocation LANGUAGES CXX)
4+
5+
set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake")
6+
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH})
7+
find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH)
8+
9+
set(CMAKE_CXX_STANDARD 17)
10+
set(CMAKE_CXX_STANDARD_REQUIRED True)
11+
12+
# Fetch pybind11
13+
include(FetchContent)
14+
FetchContent_Declare(
15+
pybind11
16+
URL https://github.com/pybind/pybind11/archive/refs/tags/v2.9.2.tar.gz
17+
URL_HASH SHA256=6bd528c4dbe2276635dc787b6b1f2e5316cf6b49ee3e150264e455a0d68d19c1
18+
)
19+
FetchContent_MakeAvailable(pybind11)
20+
21+
find_package(PythonExtensions REQUIRED)
22+
find_package(Dpctl REQUIRED)
23+
find_package(NumPy REQUIRED)
24+
25+
set(py_module_name _external_usm_alloc)
26+
pybind11_add_module(${py_module_name}
27+
MODULE
28+
external_usm_allocation/_usm_alloc_example.cpp
29+
)
30+
target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS})
31+
target_compile_options(${py_module_name} PRIVATE -Wno-deprecated-declarations)
32+
install(TARGETS ${py_module_name}
33+
DESTINATION external_usm_allocation
34+
)
35+
36+
set(ignoreMe "${SKBUILD}")

examples/pybind11/external_usm_allocation/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,15 @@ to dpctl.memory entities using `__sycl_usm_array_interface__`.
1010
```
1111
source /opt/intel/oneapi/compiler/latest/env/vars.sh
1212
CXX=dpcpp CC=dpcpp python setup.py build_ext --inplace
13+
python -m pytest tests
1314
python example.py
1415
```
1516

1617
# Sample output
1718

1819
```
1920
(idp) [12:43:20 ansatnuc04 external_usm_allocation]$ python example.py
20-
<external_usm_alloc.DMatrix object at 0x7f2b98b4cef0>
21+
<external_usm_allocation._external_usm_alloc.DMatrix object at 0x7f2b98b4cef0>
2122
{'data': [94846745444352, True], 'shape': (5, 5), 'strides': None, 'version': 1, 'typestr': '|f8', 'syclobj': <capsule object "SyclQueueRef" at 0x7f2b9b941d80>}
2223
shared
2324

examples/pybind11/external_usm_allocation/example.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
# coding: utf-8
1818

19-
import external_usm_alloc as eua
19+
import external_usm_allocation as eua
2020
import numpy as np
2121

2222
import dpctl
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Data Parallel Control (dpctl)
2+
#
3+
# Copyright 2020-2021 Intel Corporation
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
# coding: utf-8
18+
19+
from ._external_usm_alloc import DMatrix
20+
21+
__all__ = ["DMatrix"]
22+
23+
__doc__ = """
24+
Example of implementing C++ class with its own USM memory allocation logic
25+
and interfacing that allocation with `dpctl` by implementing
26+
`__sycl_usm_array_interface__`.
27+
"""

examples/pybind11/external_usm_allocation/_usm_alloc_example.cpp renamed to examples/pybind11/external_usm_allocation/external_usm_allocation/_usm_alloc_example.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ py::list tolist(DMatrix &m)
132132
return rows;
133133
}
134134

135-
PYBIND11_MODULE(external_usm_alloc, m)
135+
PYBIND11_MODULE(_external_usm_alloc, m)
136136
{
137137
// Import the dpctl extensions
138138
import_dpctl();

examples/pybind11/external_usm_allocation/setup.py

Lines changed: 9 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -14,21 +14,13 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17-
from pybind11.setup_helpers import Pybind11Extension
18-
from setuptools import setup
17+
from skbuild import setup
1918

20-
import dpctl
21-
22-
ext_modules = [
23-
Pybind11Extension(
24-
"external_usm_alloc",
25-
["./_usm_alloc_example.cpp"],
26-
include_dirs=[dpctl.get_include()],
27-
extra_compile_args=["-fPIC"],
28-
extra_link_args=["-fPIC"],
29-
libraries=["sycl"],
30-
language="c++",
31-
)
32-
]
33-
34-
setup(name="external_usm_alloc", ext_modules=ext_modules)
19+
setup(
20+
name="external_usm_allocation",
21+
version="0.0.1",
22+
description="an example of SYCL-powered Python package (with pybind11)",
23+
author="Intel Scripting",
24+
license="Apache 2.0",
25+
packages=["external_usm_allocation"],
26+
)
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Data Parallel Control (dpctl)
2+
#
3+
# Copyright 2020-2021 Intel Corporation
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
# coding: utf-8
18+
19+
import external_usm_allocation as eua
20+
import numpy as np
21+
22+
import dpctl
23+
import dpctl.memory as dpm
24+
25+
26+
def test_dmatrix():
    """Check that a DMatrix can be viewed and filled through dpctl.memory.

    The test asserts that the object exposes ``__sycl_usm_array_interface__``,
    that the memory object built from it reports the "shared" USM type, and
    that bytes copied into that memory object from the host are what
    ``DMatrix.tolist()`` returns afterwards.
    """
    queue = dpctl.SyclQueue()
    matrix = eua.DMatrix(queue, 5, 5)
    assert hasattr(matrix, "__sycl_usm_array_interface__")

    usm_mem = dpm.as_usm_memory(matrix)
    assert usm_mem.get_usm_type() == "shared"

    reference_rows = [
        [1, 1, 1, 2, 2],
        [1, 0, 1, 2, 2],
        [1, 1, 0, 2, 2],
        [0, 0, 0, 3, -1],
        [0, 0, 0, -1, 5],
    ]
    Xh = np.array(reference_rows, dtype="d")

    # The memory object is filled from a flat byte view of the host array.
    flat_bytes = Xh.reshape((-1)).view(np.ubyte)
    usm_mem.copy_from_host(flat_bytes)

    assert matrix.tolist() == Xh.tolist()

examples/pybind11/onemkl_gemv/CMakeLists.txt

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ pybind11_add_module(${py_module_name}
3838
sycl_gemm/_onemkl.cpp
3939
)
4040
target_include_directories(${py_module_name}
41-
PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR}
41+
PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR} sycl_gemm
4242
)
4343
target_link_libraries(${py_module_name}
4444
PRIVATE ${mkl_sycl} ${mkl_intel_ilp64} ${mkl_tbb_thread} ${mkl_core} ${tbb}
@@ -53,4 +53,18 @@ set_source_files_properties(${_sycl_gemm_sources}
5353
COMPILE_OPTIONS "-O3;-Wno-deprecated-declarations"
5454
)
5555

56+
add_executable(standalone_cpp
57+
EXCLUDE_FROM_ALL
58+
cpp/main.cpp
59+
)
60+
target_compile_options(standalone_cpp
61+
PRIVATE -O3 -Wno-deprecated-declarations
62+
)
63+
target_include_directories(standalone_cpp
64+
PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR} sycl_gemm
65+
)
66+
target_link_libraries(standalone_cpp
67+
PRIVATE ${mkl_sycl} ${mkl_intel_ilp64} ${mkl_tbb_thread} ${mkl_core} ${tbb}
68+
)
69+
5670
set(ignoreMe "${SKBUILD}")
Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,32 @@
11
Example of SYCL built pybind11 extension
22

3-
To build, use (assumes scikit-build and dpcpp) is installed
3+
To build, use (assumes scikit-build and dpcpp are installed):
44

55
```sh
6-
python setup.py develop -- -G "Ninja" -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_CXX_COMPILER:PATH=icpx -DTBB_LIBRARY_DIR=$CONDA_PREFIX/lib -DMKL_LIBRARY_DIR=${CONDA_PREFIX}/lib -DMKL_INCLUDE_DIR=${CONDA_PREFIX}/include -DTBB_INCLUDE_DIR=${CONDA_PREFIX}/include
6+
python setup.py develop -- -G "Ninja" \
7+
-DCMAKE_C_COMPILER:PATH=icx \
8+
-DCMAKE_CXX_COMPILER:PATH=icpx \
9+
-DTBB_LIBRARY_DIR=$CONDA_PREFIX/lib \
10+
-DMKL_LIBRARY_DIR=${CONDA_PREFIX}/lib \
11+
-DMKL_INCLUDE_DIR=${CONDA_PREFIX}/include \
12+
-DTBB_INCLUDE_DIR=${CONDA_PREFIX}/include
713
```
814

915
To run test suite
1016

1117
```sh
1218
python -m pytest tests
1319
```
20+
21+
To compare Python overhead,
22+
23+
```
24+
# build stand-alone executable
25+
cmake --build $(find . -name cmake-build) --target standalone_cpp
26+
# execute it
27+
$(find . -name cmake-build)/standalone_cpp 1000 11
28+
# launch Python computation
29+
python sycl_timing_solver.py 1000 11
30+
```
31+
32+
Compare host times vs. C++ wall-clock times while making sure that the number of iterations is the same.
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
#include "cg_solver.hpp"
#include <CL/sycl.hpp>
#include <chrono>
#include <cstddef>
#include <exception>
#include <iostream>
#include <oneapi/mkl.hpp>
#include <string>
#include <vector>
6+
7+
using T = double;
8+
9+
int main(int argc, char *argv[])
10+
{
11+
size_t n = 1000;
12+
size_t rank = 16;
13+
14+
if (argc > 1) {
15+
n = std::stoi(argv[1]);
16+
}
17+
18+
if (argc > 2) {
19+
rank = std::stoi(argv[2]);
20+
}
21+
22+
std::cout << "Solving " << n << " by " << n << " diagonal system with rank-"
23+
<< rank << " perturbation." << std::endl;
24+
25+
sycl::queue q;
26+
27+
// USM allocation for data needed by program
28+
size_t buf_size = n * n + rank * n + 2 * n;
29+
T *buf = sycl::malloc_device<T>(buf_size, q);
30+
sycl::event memset_ev = q.fill<T>(buf, T(0), buf_size);
31+
32+
T *Amat = buf;
33+
T *umat = buf + n * n;
34+
T *bvec = umat + rank * n;
35+
T *sol_vec = bvec + n;
36+
37+
sycl::event set_diag_ev = q.submit([&](sycl::handler &cgh) {
38+
cgh.depends_on({memset_ev});
39+
cgh.parallel_for({n}, [=](sycl::id<1> id) {
40+
auto i = id[0];
41+
Amat[i * (n + 1)] = T(1);
42+
});
43+
});
44+
45+
oneapi::mkl::rng::philox4x32x10 engine(q, 7777);
46+
oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>
47+
distr(0.0, 1.0);
48+
49+
// populate umat and bvec in one call
50+
sycl::event umat_rand_ev =
51+
oneapi::mkl::rng::generate(distr, engine, n * rank + n, umat);
52+
53+
sycl::event syrk_ev = oneapi::mkl::blas::row_major::syrk(
54+
q, oneapi::mkl::uplo::U, oneapi::mkl::transpose::N, n, rank, T(1), umat,
55+
rank, T(1), Amat, n, {umat_rand_ev, set_diag_ev});
56+
57+
// need to transpose
58+
sycl::event transpose_ev = q.submit([&](sycl::handler &cgh) {
59+
cgh.depends_on(syrk_ev);
60+
cgh.parallel_for({n * n}, [=](sycl::id<1> id) {
61+
size_t i = id[0];
62+
size_t i0 = i / n;
63+
size_t i1 = i - i0 * n;
64+
if (i0 > i1) {
65+
Amat[i] = Amat[i1 * n + i0];
66+
}
67+
});
68+
});
69+
70+
q.wait();
71+
72+
constexpr int reps = 6;
73+
74+
std::vector<double> time;
75+
std::vector<int> conv_iters;
76+
77+
time.reserve(reps);
78+
conv_iters.reserve(reps);
79+
for (int i = 0; i < reps; ++i) {
80+
auto start = std::chrono::high_resolution_clock::now();
81+
int conv_iter_count = cg_solver::cg_solve(q, n, Amat, bvec, sol_vec);
82+
auto end = std::chrono::high_resolution_clock::now();
83+
84+
time.push_back(
85+
std::chrono::duration_cast<std::chrono::nanoseconds>(end - start)
86+
.count() *
87+
1e-06);
88+
89+
conv_iters.push_back(conv_iter_count);
90+
}
91+
92+
std::cout << "Converged in : ";
93+
for (auto &el : conv_iters) {
94+
std::cout << el << " , ";
95+
}
96+
std::cout << std::endl;
97+
98+
std::cout << "Wall-clock cg_solve execution times: ";
99+
for (auto &el : time) {
100+
std::cout << el << " , ";
101+
}
102+
std::cout << std::endl;
103+
104+
T *Ax = sycl::malloc_device<T>(2 * n + 1, q);
105+
T *delta = Ax + n;
106+
107+
sycl::event gemv_ev = oneapi::mkl::blas::row_major::gemv(
108+
q, oneapi::mkl::transpose::N, n, n, T(1), Amat, n, sol_vec, 1, T(0), Ax,
109+
1);
110+
111+
sycl::event sub_ev = oneapi::mkl::vm::sub(q, n, Ax, bvec, delta, {gemv_ev},
112+
oneapi::mkl::vm::mode::ha);
113+
114+
T *n2 = delta + n;
115+
sycl::event dot_ev = oneapi::mkl::blas::row_major::dot(
116+
q, n, delta, 1, delta, 1, n2, {sub_ev});
117+
118+
T n2_host{};
119+
q.copy<T>(n2, &n2_host, 1, {dot_ev}).wait_and_throw();
120+
121+
std::cout << "Redisual norm squared: " << n2_host << std::endl;
122+
123+
q.wait_and_throw();
124+
sycl::free(Ax, q);
125+
sycl::free(buf, q);
126+
127+
return 0;
128+
}

examples/pybind11/onemkl_gemv/setup.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,19 @@
1+
# Data Parallel Control (dpctl)
2+
#
3+
# Copyright 2020-2021 Intel Corporation
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
117
from skbuild import setup
218

319
setup(

0 commit comments

Comments
 (0)