Streamlined Chebyshev solver

oleksandr-pavlyk · oleksandr-pavlyk · commit fc7041eaed2f · 2022-04-27T12:08:54.000-05:00
Expose the C++ cg_solve implementation used by the standalone_cpp executable to Python as sycl_gemm.cpp_cg_solve.
Call it from the Python script sycl_timing_solver.py:

```bash
$ python sycl_timing_solver.py 1000 11
Solving 1000 by 1000 diagonal linear system with rank 11 perturbation.
    Name            Intel(R) UHD Graphics [0x9bca]
    Driver version  1.3.22992
    Vendor          Intel(R) Corporation
    Profile         FULL_PROFILE
    Filter string   level_zero:gpu:0

Using not in-order queue
0 (host_dt, device_dt)= (1157.4030127376318, 403.9605020000001)
1 (host_dt, device_dt)= (421.32044583559036, 403.45619400000004)
2 (host_dt, device_dt)= (420.66121101379395, 402.57058400000005)
3 (host_dt, device_dt)= (421.5433243662119, 402.9254920000001)
4 (host_dt, device_dt)= (421.9988752156496, 402.8818340000001)
5 (host_dt, device_dt)= (422.3589450120926, 402.63814600000006)
Converged in:  [11, 11, 11, 11, 11, 11]
Python solution residual norm squared: 3.2839902926527995e-25
0 (host_dt, device_dt)= (412.9443597048521, 403.6290000000001)
1 (host_dt, device_dt)= (413.7023724615574, 403.8434720000001)
2 (host_dt, device_dt)= (413.4188834577799, 403.1985620000001)
3 (host_dt, device_dt)= (413.85203413665295, 402.70404800000006)
4 (host_dt, device_dt)= (416.2806496024132, 404.0513040000001)
5 (host_dt, device_dt)= (417.43320040404797, 404.74999800000006)
Converged in:  [11, 11, 11, 11, 11, 11]
cpp_cg_solve solution residual norm squared: 3.218393087932091e-25
```
diff --git a/examples/pybind11/onemkl_gemv/solve.py b/examples/pybind11/onemkl_gemv/solve.py
@@ -38,44 +38,44 @@ def chebyshev(A, b, x0, nIters, lMax, lMin, depends=[]):
     p = empty_like(Ax)
 
     e_x = dpctl.SyclEvent()
-    he_dot, e_dot = sycl_gemm.gemv(
-        exec_queue, A, x, Ax, depends=depends
-    )  # Ax = A @ x
-    he_sub, e_sub = sycl_gemm.sub(
-        exec_queue, b, Ax, r, depends=[e_dot]
-    )  # r = b - Ax
+    # Ax = A @ x
+    _, e_dot = sycl_gemm.gemv(exec_queue, A, x, Ax, depends=depends)
+    # r = b - Ax
+    _, e_sub = sycl_gemm.sub(exec_queue, b, Ax, r, depends=[e_dot])
     r_ev = e_sub
     for i in range(nIters):
         z = r
         z_ev = r_ev
         if i == 0:
             p[:] = z
             alpha = 1 / d
-            he_axpby, e_axpby = dpctl.SyclEvent(), dpctl.SyclEvent()
+            _, e_axpby = dpctl.SyclEvent(), dpctl.SyclEvent()
         elif i == 1:
             beta = 0.5 * (c * alpha) ** 2
             alpha = 1 / (d - beta / alpha)
-            he_axpby, e_axpby = sycl_gemm.axpby_inplace(
+            # p = z + beta * p
+            _, e_axpby = sycl_gemm.axpby_inplace(
                 exec_queue, 1, z, beta, p, depends=[z_ev]
-            )  # p = z + beta * p
+            )
         else:
             beta = (c / 2 * alpha) ** 2
             alpha = 1 / (d - beta / alpha)
-            he_axpby, e_axpby = sycl_gemm.axpby_inplace(
+            # p = z + beta * p
+            _, e_axpby = sycl_gemm.axpby_inplace(
                 exec_queue, 1, z, beta, p, depends=[z_ev]
-            )  # p = z + beta * p
-        h_x, e_x = sycl_gemm.axpby_inplace(
+            )
+        # x = x + alpha * p
+        _, e_x = sycl_gemm.axpby_inplace(
             exec_queue, alpha, p, 1, x, depends=[e_axpby, e_x]
-        )  # x = x + alpha * p
-        he_dot, e_dot = sycl_gemm.gemv(
-            exec_queue, A, x, Ax, depends=[e_x]
-        )  # Ax = A @ x
-        he_sub, e_sub = sycl_gemm.sub(
-            exec_queue, b, Ax, r, depends=[e_dot]
-        )  # r = b - Ax
+        )
+        # Ax = A @ x
+        _, e_dot = sycl_gemm.gemv(exec_queue, A, x, Ax, depends=[e_x])
+        # r = b - Ax
+        _, e_sub = sycl_gemm.sub(exec_queue, b, Ax, r, depends=[e_dot])
+        # residual = dot(r, r)
         residual = sycl_gemm.norm_squared_blocking(
             exec_queue, r, depends=[e_sub]
-        )  # residual = dot(r, r)
+        )
         if residual <= 1e-29:
             print(f"chebyshev: converged in {i} iters")
             break
diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py b/examples/pybind11/onemkl_gemv/sycl_gemm/__init__.py
@@ -16,6 +16,7 @@
 
 from ._onemkl import (
     axpby_inplace,
+    cpp_cg_solve,
     dot_blocking,
     gemv,
     norm_squared_blocking,
@@ -28,4 +29,5 @@
     "axpby_inplace",
     "norm_squared_blocking",
     "dot_blocking",
+    "cpp_cg_solve",
 ]
diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
@@ -62,6 +62,15 @@ py_gemv(sycl::queue q,
         throw std::runtime_error("Inconsistent shapes.");
     }
 
+    auto q_ctx = q.get_context();
+    if (q_ctx != matrix.get_queue().get_context() ||
+        q_ctx != vector.get_queue().get_context() ||
+        q_ctx != result.get_queue().get_context())
+    {
+        throw std::runtime_error(
+            "USM allocation is not bound to the context in execution queue.");
+    }
+
     int mat_flags = matrix.get_flags();
     int v_flags = vector.get_flags();
     int r_flags = result.get_flags();
@@ -176,6 +185,14 @@ py_sub(sycl::queue q,
         throw std::runtime_error("Vectors must have the same length");
     }
 
+    if (q.get_context() != in_v1.get_queue().get_context() ||
+        q.get_context() != in_v2.get_queue().get_context() ||
+        q.get_context() != out_r.get_queue().get_context())
+    {
+        throw std::runtime_error(
+            "USM allocation is not bound to the context in execution queue");
+    }
+
     int in_v1_flags = in_v1.get_flags();
     int in_v2_flags = in_v2.get_flags();
     int out_r_flags = out_r.get_flags();
@@ -277,6 +294,13 @@ py_axpby_inplace(sycl::queue q,
         throw std::runtime_error("Vectors must have the same length");
     }
 
+    if (q.get_context() != x.get_queue().get_context() ||
+        q.get_context() != y.get_queue().get_context())
+    {
+        throw std::runtime_error(
+            "USM allocation is not bound to the context in execution queue");
+    }
+
     int x_flags = x.get_flags();
     int y_flags = y.get_flags();
 
@@ -373,6 +397,11 @@ py::object py_norm_squared_blocking(sycl::queue q,
         throw std::runtime_error("Vector must be contiguous.");
     }
 
+    if (q.get_context() != r.get_queue().get_context()) {
+        throw std::runtime_error(
+            "USM allocation is not bound to the context in execution queue");
+    }
+
     int r_typenum = r.get_typenum();
     if ((r_typenum != UAR_DOUBLE) && (r_typenum != UAR_FLOAT) &&
         (r_typenum != UAR_CDOUBLE) && (r_typenum != UAR_CFLOAT))
@@ -437,6 +466,13 @@ py::object py_dot_blocking(sycl::queue q,
         throw std::runtime_error("Vectors must be contiguous.");
     }
 
+    if (q.get_context() != v1.get_queue().get_context() ||
+        q.get_context() != v2.get_queue().get_context())
+    {
+        throw std::runtime_error(
+            "USM allocation is not bound to the context in execution queue");
+    }
+
     int v1_typenum = v1.get_typenum();
     int v2_typenum = v2.get_typenum();
 
@@ -500,6 +536,80 @@ py::object py_dot_blocking(sycl::queue q,
     return res;
 }
 
+int py_cg_solve(sycl::queue exec_q, // Validates the arrays, then dispatches to cg_solver::cg_solve<T> and returns its int result
+                dpctl::tensor::usm_ndarray Amat, // must be 2-D, square and C-contiguous
+                dpctl::tensor::usm_ndarray bvec, // must be 1-D, C-contiguous, with length Amat.get_shape(0)
+                dpctl::tensor::usm_ndarray xvec, // same constraints as bvec; handed to cg_solve through a non-const pointer
+                double rs_tol, // cast to the element type T and forwarded to cg_solve
+                const std::vector<sycl::event> &depends = {}) // forwarded to cg_solve unchanged
+{
+    if (Amat.get_ndim() != 2 || bvec.get_ndim() != 1 || xvec.get_ndim() != 1) { // rank check
+        throw py::value_error("Expecting a matrix and two vectors");
+    }
+
+    py::ssize_t n0 = Amat.get_shape(0);
+    py::ssize_t n1 = Amat.get_shape(1);
+
+    if (n0 != n1) {
+        throw py::value_error("Matrix must be square.");
+    }
+
+    if (n0 != bvec.get_shape(0) || n0 != xvec.get_shape(0)) { // both vectors must match the matrix dimension
+        throw py::value_error(
+            "Dimensions of the matrix and vectors are not consistent.");
+    }
+
+    bool all_contig = (Amat.get_flags() & USM_ARRAY_C_CONTIGUOUS) && // the raw pointers below are passed without strides
+                      (bvec.get_flags() & USM_ARRAY_C_CONTIGUOUS) &&
+                      (xvec.get_flags() & USM_ARRAY_C_CONTIGUOUS);
+    if (!all_contig) {
+        throw py::value_error("All inputs must be C-contiguous");
+    }
+
+    int A_typenum = Amat.get_typenum();
+    int b_typenum = bvec.get_typenum();
+    int x_typenum = xvec.get_typenum();
+
+    if (A_typenum != b_typenum || A_typenum != x_typenum) { // no mixed-type solves: one T is used for all three arrays
+        throw py::value_error("All arrays must have the same type");
+    }
+
+    if (exec_q.get_context() != Amat.get_queue().get_context() || // each array's queue context must equal the execution queue's context
+        exec_q.get_context() != bvec.get_queue().get_context() ||
+        exec_q.get_context() != xvec.get_queue().get_context())
+    {
+        throw std::runtime_error( // note: std::runtime_error here, unlike the py::value_error checks above
+            "USM allocations are not bound to context in execution queue");
+    }
+
+    const char *A_ch = Amat.get_data(); // untyped data pointers; reinterpret_cast to T after the typenum dispatch below
+    const char *b_ch = bvec.get_data();
+    char *x_ch = xvec.get_data(); // non-const: presumably cg_solve writes the solution here — confirm against cg_solver
+
+    if (A_typenum == UAR_DOUBLE) { // double-precision path
+        using T = double;
+        int iters = cg_solver::cg_solve<T>(
+            exec_q, n0, reinterpret_cast<const T *>(A_ch),
+            reinterpret_cast<const T *>(b_ch), reinterpret_cast<T *>(x_ch),
+            depends, static_cast<T>(rs_tol));
+
+        return iters; // NOTE(review): presumably the CG iteration count — confirm against cg_solver::cg_solve
+    }
+    else if (A_typenum == UAR_FLOAT) { // single-precision path; rs_tol is narrowed to float
+        using T = float;
+        int iters = cg_solver::cg_solve<T>(
+            exec_q, n0, reinterpret_cast<const T *>(A_ch),
+            reinterpret_cast<const T *>(b_ch), reinterpret_cast<T *>(x_ch),
+            depends, static_cast<T>(rs_tol));
+
+        return iters;
+    }
+    else { // every other typenum is rejected
+        throw std::runtime_error(
+            "Unsupported data type. Use single or double precision.");
+    }
+}
+
 PYBIND11_MODULE(_onemkl, m)
 {
     // Import the dpctl extensions
@@ -518,4 +628,10 @@ PYBIND11_MODULE(_onemkl, m)
           py::arg("exec_queue"), py::arg("r"), py::arg("depends") = py::list());
     m.def("dot_blocking", &py_dot_blocking, "<v1, v2>", py::arg("exec_queue"),
           py::arg("v1"), py::arg("v2"), py::arg("depends") = py::list());
+
+    m.def("cpp_cg_solve", &py_cg_solve,
+          "Dispatch to call C++ implementation of cg_solve",
+          py::arg("exec_queue"), py::arg("Amat"), py::arg("bvec"),
+          py::arg("xvec"), py::arg("rs_squared_tolerance") = py::float_(1e-20),
+          py::arg("depends") = py::list());
 }
diff --git a/examples/pybind11/onemkl_gemv/sycl_timing_solver.py b/examples/pybind11/onemkl_gemv/sycl_timing_solver.py
@@ -74,3 +74,19 @@
 rs = sycl_gemm.norm_squared_blocking(q, delta)
 dpctl.SyclEvent.wait_for([hev, hev2])
 print(f"Python solution residual norm squared: {rs}")
+
+x_cpp = dpt.empty_like(b)
+iters = []
+for i in range(6):
+    with timer(api_dev.sycl_queue):
+        conv_in = sycl_gemm.cpp_cg_solve(q, A, b, x_cpp)
+
+    print(i, "(host_dt, device_dt)=", timer.dt)
+    iters.append(conv_in)
+
+print("Converged in: ", iters)
+hev, ev = sycl_gemm.gemv(q, A, x_cpp, r)
+hev2, ev2 = sycl_gemm.sub(q, r, b, delta, [ev])
+rs = sycl_gemm.norm_squared_blocking(q, delta)
+dpctl.SyclEvent.wait_for([hev, hev2])
+print(f"cpp_cg_solve solution residual norm squared: {rs}")