Skip to content

onemkl_gemv example expanded: iterative linear solver example working #821

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Apr 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions examples/pybind11/external_usm_allocation/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Build script for the `external_usm_allocation` pybind11 example:
# compiles a single C++ source into the `_external_usm_alloc` Python
# extension module and installs it into the `external_usm_allocation`
# destination directory.
cmake_minimum_required(VERSION 3.21)

project(external_usm_allocation LANGUAGES CXX)

# Look for the IntelDPCPP package only in the cmake/ directory three levels
# above this source directory; NO_DEFAULT_PATH disables every other search
# location.
set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake")
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH})
find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH)

# C++17 is mandatory (no fallback to an earlier standard).
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED True)

# Fetch pybind11 (release v2.9.2 tarball, pinned by its SHA256 hash)
include(FetchContent)
FetchContent_Declare(
pybind11
URL https://github.com/pybind/pybind11/archive/refs/tags/v2.9.2.tar.gz
URL_HASH SHA256=6bd528c4dbe2276635dc787b6b1f2e5316cf6b49ee3e150264e455a0d68d19c1
)
FetchContent_MakeAvailable(pybind11)

# Packages that must be discoverable at configure time.
find_package(PythonExtensions REQUIRED)
find_package(Dpctl REQUIRED)
find_package(NumPy REQUIRED)

# The target name is also the name of the produced Python module; it must
# match the name given to PYBIND11_MODULE in the C++ source.
set(py_module_name _external_usm_alloc)
pybind11_add_module(${py_module_name}
MODULE
external_usm_allocation/_usm_alloc_example.cpp
)
# Make the dpctl headers visible to the extension's sources.
target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS})
# Silence deprecation warnings when compiling this target.
target_compile_options(${py_module_name} PRIVATE -Wno-deprecated-declarations)
install(TARGETS ${py_module_name}
DESTINATION external_usm_allocation
)

# NOTE(review): presumably this reference to SKBUILD exists only so CMake does
# not warn about an unused, manually specified variable -- confirm.
set(ignoreMe "${SKBUILD}")
3 changes: 2 additions & 1 deletion examples/pybind11/external_usm_allocation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,15 @@ to dpctl.memory entities using `__sycl_usm_array_interface__`.
```
source /opt/intel/oneapi/compiler/latest/env/vars.sh
CXX=dpcpp CC=dpcpp python setup.py build_ext --inplace
python -m pytest tests
python example.py
```

# Sample output

```
(idp) [12:43:20 ansatnuc04 external_usm_allocation]$ python example.py
<external_usm_alloc.DMatrix object at 0x7f2b98b4cef0>
<external_usm_allocation._external_usm_alloc.DMatrix object at 0x7f2b98b4cef0>
{'data': [94846745444352, True], 'shape': (5, 5), 'strides': None, 'version': 1, 'typestr': '|f8', 'syclobj': <capsule object "SyclQueueRef" at 0x7f2b9b941d80>}
shared

Expand Down
2 changes: 1 addition & 1 deletion examples/pybind11/external_usm_allocation/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

# coding: utf-8

import external_usm_alloc as eua
import external_usm_allocation as eua
import numpy as np

import dpctl
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Data Parallel Control (dpctl)
#
# Copyright 2020-2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# coding: utf-8

# Re-export DMatrix from the `_external_usm_alloc` extension module that
# lives inside this package.
from ._external_usm_alloc import DMatrix

# Names exported by `from external_usm_allocation import *`.
__all__ = ["DMatrix"]

# Package documentation, assigned explicitly rather than written as a
# leading module docstring.
__doc__ = """
Example of implementing C++ class with its own USM memory allocation logic
and interfacing that allocation with `dpctl` by implementing
`__sycl_usm_array_interface__`.
"""
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ py::list tolist(DMatrix &m)
return rows;
}

PYBIND11_MODULE(external_usm_alloc, m)
PYBIND11_MODULE(_external_usm_alloc, m)
{
// Import the dpctl extensions
import_dpctl();
Expand Down
26 changes: 9 additions & 17 deletions examples/pybind11/external_usm_allocation/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from pybind11.setup_helpers import Pybind11Extension
from setuptools import setup
from skbuild import setup

import dpctl

ext_modules = [
Pybind11Extension(
"external_usm_alloc",
["./_usm_alloc_example.cpp"],
include_dirs=[dpctl.get_include()],
extra_compile_args=["-fPIC"],
extra_link_args=["-fPIC"],
libraries=["sycl"],
language="c++",
)
]

setup(name="external_usm_alloc", ext_modules=ext_modules)
setup(
name="external_usm_allocation",
version="0.0.1",
description="an example of SYCL-powered Python package (with pybind11)",
author="Intel Scripting",
license="Apache 2.0",
packages=["external_usm_allocation"],
)
48 changes: 48 additions & 0 deletions examples/pybind11/external_usm_allocation/tests/test_dmatrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Data Parallel Control (dpctl)
#
# Copyright 2020-2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# coding: utf-8

import external_usm_allocation as eua
import numpy as np

import dpctl
import dpctl.memory as dpm


def test_dmatrix():
    """Check that a DMatrix allocation is usable through dpctl.

    A 5x5 DMatrix must expose `__sycl_usm_array_interface__`, be viewable
    as "shared" USM memory, and report through `tolist()` exactly the
    values copied into that memory from the host.
    """
    queue = dpctl.SyclQueue()
    matrix = eua.DMatrix(queue, 5, 5)
    assert hasattr(matrix, "__sycl_usm_array_interface__")

    usm_memory = dpm.as_usm_memory(matrix)
    assert usm_memory.get_usm_type() == "shared"

    reference_rows = [
        [1, 1, 1, 2, 2],
        [1, 0, 1, 2, 2],
        [1, 1, 0, 2, 2],
        [0, 0, 0, 3, -1],
        [0, 0, 0, -1, 5],
    ]
    host_matrix = np.array(reference_rows, dtype="d")

    # The USM memory object is filled from a flat, byte-level view of the
    # host array.
    flat_bytes = host_matrix.reshape((-1)).view(np.ubyte)
    usm_memory.copy_from_host(flat_bytes)

    device_rows = matrix.tolist()
    assert device_rows == host_matrix.tolist()
16 changes: 15 additions & 1 deletion examples/pybind11/onemkl_gemv/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ pybind11_add_module(${py_module_name}
sycl_gemm/_onemkl.cpp
)
target_include_directories(${py_module_name}
PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR}
PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR} sycl_gemm
)
target_link_libraries(${py_module_name}
PRIVATE ${mkl_sycl} ${mkl_intel_ilp64} ${mkl_tbb_thread} ${mkl_core} ${tbb}
Expand All @@ -53,4 +53,18 @@ set_source_files_properties(${_sycl_gemm_sources}
COMPILE_OPTIONS "-O3;-Wno-deprecated-declarations"
)

# Stand-alone C++ executable built from cpp/main.cpp. EXCLUDE_FROM_ALL keeps
# it out of the default build, so it is produced only when requested
# explicitly (e.g. `cmake --build <dir> --target standalone_cpp`).
add_executable(standalone_cpp
EXCLUDE_FROM_ALL
cpp/main.cpp
)
# Optimize and silence deprecation warnings for this target.
target_compile_options(standalone_cpp
PRIVATE -O3 -Wno-deprecated-declarations
)
# `sycl_gemm` is on the include path in addition to the MKL and TBB headers.
target_include_directories(standalone_cpp
PUBLIC ${MKL_INCLUDE_DIR} ${TBB_INCLUDE_DIR} sycl_gemm
)
# Link against the MKL (SYCL, ILP64, TBB-threaded, core) and TBB libraries.
target_link_libraries(standalone_cpp
PRIVATE ${mkl_sycl} ${mkl_intel_ilp64} ${mkl_tbb_thread} ${mkl_core} ${tbb}
)

set(ignoreMe "${SKBUILD}")
23 changes: 21 additions & 2 deletions examples/pybind11/onemkl_gemv/README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,32 @@
Example of SYCL built pybind11 extension

To build, use (assumes scikit-build and dpcpp) is installed
To build, use (assumes scikit-build and dpcpp are installed):

```sh
python setup.py develop -- -G "Ninja" -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_CXX_COMPILER:PATH=icpx -DTBB_LIBRARY_DIR=$CONDA_PREFIX/lib -DMKL_LIBRARY_DIR=${CONDA_PREFIX}/lib -DMKL_INCLUDE_DIR=${CONDA_PREFIX}/include -DTBB_INCLUDE_DIR=${CONDA_PREFIX}/include
python setup.py develop -- -G "Ninja" \
-DCMAKE_C_COMPILER:PATH=icx \
-DCMAKE_CXX_COMPILER:PATH=icpx \
-DTBB_LIBRARY_DIR=$CONDA_PREFIX/lib \
-DMKL_LIBRARY_DIR=${CONDA_PREFIX}/lib \
-DMKL_INCLUDE_DIR=${CONDA_PREFIX}/include \
-DTBB_INCLUDE_DIR=${CONDA_PREFIX}/include
```

To run test suite

```sh
python -m pytest tests
```

To compare Python overhead,

```
# build standalone executable
cmake --build $(find . -name cmake-build) --target standalone_cpp
# execute it
$(find . -name cmake-build)/standalone_cpp 1000 11
# launch Python computation
python sycl_timing_solver.py 1000 11
```

Compare host times vs. C++ wall-clock times while making sure that the number of iterations is the same.
128 changes: 128 additions & 0 deletions examples/pybind11/onemkl_gemv/cpp/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#include "cg_solver.hpp"
#include <CL/sycl.hpp>
#include <chrono>
#include <exception>
#include <iostream>
#include <oneapi/mkl.hpp>
#include <string>
#include <vector>

using T = double;

int main(int argc, char *argv[])
{
size_t n = 1000;
size_t rank = 16;

if (argc > 1) {
n = std::stoi(argv[1]);
}

if (argc > 2) {
rank = std::stoi(argv[2]);
}

std::cout << "Solving " << n << " by " << n << " diagonal system with rank-"
<< rank << " perturbation." << std::endl;

sycl::queue q;

// USM allocation for data needed by program
size_t buf_size = n * n + rank * n + 2 * n;
T *buf = sycl::malloc_device<T>(buf_size, q);
sycl::event memset_ev = q.fill<T>(buf, T(0), buf_size);

T *Amat = buf;
T *umat = buf + n * n;
T *bvec = umat + rank * n;
T *sol_vec = bvec + n;

sycl::event set_diag_ev = q.submit([&](sycl::handler &cgh) {
cgh.depends_on({memset_ev});
cgh.parallel_for({n}, [=](sycl::id<1> id) {
auto i = id[0];
Amat[i * (n + 1)] = T(1);
});
});

oneapi::mkl::rng::philox4x32x10 engine(q, 7777);
oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>
distr(0.0, 1.0);

// populate umat and bvec in one call
sycl::event umat_rand_ev =
oneapi::mkl::rng::generate(distr, engine, n * rank + n, umat);

sycl::event syrk_ev = oneapi::mkl::blas::row_major::syrk(
q, oneapi::mkl::uplo::U, oneapi::mkl::transpose::N, n, rank, T(1), umat,
rank, T(1), Amat, n, {umat_rand_ev, set_diag_ev});

// need to transpose
sycl::event transpose_ev = q.submit([&](sycl::handler &cgh) {
cgh.depends_on(syrk_ev);
cgh.parallel_for({n * n}, [=](sycl::id<1> id) {
size_t i = id[0];
size_t i0 = i / n;
size_t i1 = i - i0 * n;
if (i0 > i1) {
Amat[i] = Amat[i1 * n + i0];
}
});
});

q.wait();

constexpr int reps = 6;

std::vector<double> time;
std::vector<int> conv_iters;

time.reserve(reps);
conv_iters.reserve(reps);
for (int i = 0; i < reps; ++i) {
auto start = std::chrono::high_resolution_clock::now();
int conv_iter_count = cg_solver::cg_solve(q, n, Amat, bvec, sol_vec);
auto end = std::chrono::high_resolution_clock::now();

time.push_back(
std::chrono::duration_cast<std::chrono::nanoseconds>(end - start)
.count() *
1e-06);

conv_iters.push_back(conv_iter_count);
}

std::cout << "Converged in : ";
for (auto &el : conv_iters) {
std::cout << el << " , ";
}
std::cout << std::endl;

std::cout << "Wall-clock cg_solve execution times: ";
for (auto &el : time) {
std::cout << el << " , ";
}
std::cout << std::endl;

T *Ax = sycl::malloc_device<T>(2 * n + 1, q);
T *delta = Ax + n;

sycl::event gemv_ev = oneapi::mkl::blas::row_major::gemv(
q, oneapi::mkl::transpose::N, n, n, T(1), Amat, n, sol_vec, 1, T(0), Ax,
1);

sycl::event sub_ev = oneapi::mkl::vm::sub(q, n, Ax, bvec, delta, {gemv_ev},
oneapi::mkl::vm::mode::ha);

T *n2 = delta + n;
sycl::event dot_ev = oneapi::mkl::blas::row_major::dot(
q, n, delta, 1, delta, 1, n2, {sub_ev});

T n2_host{};
q.copy<T>(n2, &n2_host, 1, {dot_ev}).wait_and_throw();

std::cout << "Redisual norm squared: " << n2_host << std::endl;

q.wait_and_throw();
sycl::free(Ax, q);
sycl::free(buf, q);

return 0;
}
16 changes: 16 additions & 0 deletions examples/pybind11/onemkl_gemv/setup.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
# Data Parallel Control (dpctl)
#
# Copyright 2020-2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from skbuild import setup

setup(
Expand Down
Loading