@@ -1888,7 +1888,7 @@ typedef sycl::event (*tri_fn_ptr_t)(sycl::queue,
     char *,        // dst_data_ptr
     py::ssize_t,   // nd
     py::ssize_t *, // shape_and_strides
-    int,           // k
+    py::ssize_t,   // k
     const std::vector<sycl::event> &,
     const std::vector<sycl::event> &);

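Both this function-pointer typedef and the implementations below widen the diagonal offset `k` from `int` to `py::ssize_t`, so it shares the signed index type used for shapes and strides. A standalone sketch of the truncation a 32-bit `k` would risk on very tall arrays (not dpctl code; the matrix size is made up):

#include <cstdint>
#include <cstdio>

int main()
{
    // For an m x n matrix, valid diagonals run from -(m - 1) to n - 1.
    std::int64_t m = (1LL << 31) + 8;  // taller than INT_MAX
    std::int64_t k = -(m - 1);         // lowest diagonal
    int k32 = static_cast<int>(k);     // narrows: value no longer representable
    std::printf("k = %lld, as int = %d\n", static_cast<long long>(k), k32);
    return 0;
}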
@@ -1901,7 +1901,7 @@ sycl::event tri_impl(sycl::queue exec_q,
                     char *dst_p,
                     py::ssize_t nd,
                     py::ssize_t *shape_and_strides,
-                    int k,
+                    py::ssize_t k,
                     const std::vector<sycl::event> &depends,
                     const std::vector<sycl::event> &additional_depends)
{
@@ -1917,12 +1917,15 @@ sycl::event tri_impl(sycl::queue exec_q,
        cgh.depends_on(depends);
        cgh.depends_on(additional_depends);
        cgh.parallel_for<tri_kernel<Ty, l>>(
-           sycl::range<2>(inner_range, outer_range), [=](sycl::item<2> idx) {
+           sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) {
+               py::ssize_t outer_gid = idx[0] / inner_range;
+               py::ssize_t inner_gid = idx[0] - inner_range * outer_gid;
+
                py::ssize_t src_inner_offset, dst_inner_offset;
                bool to_copy;

                {
-                   py::ssize_t inner_gid = idx.get_id(0);
+                   // py::ssize_t inner_gid = idx.get_id(0);
                    CIndexer_array<d2, py::ssize_t> indexer_i(
                        {shape_and_strides[nd_2], shape_and_strides[nd_1]});
                    indexer_i.set(inner_gid);
@@ -1943,7 +1946,7 @@ sycl::event tri_impl(sycl::queue exec_q,
                py::ssize_t src_offset = 0;
                py::ssize_t dst_offset = 0;
                {
-                   py::ssize_t outer_gid = idx.get_id(1);
+                   // py::ssize_t outer_gid = idx.get_id(1);
                    CIndexer_vector<py::ssize_t> outer(nd - d2);
                    outer.get_displacement(
                        outer_gid, shape_and_strides, shape_and_strides + src_s,
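The kernel now launches over a flat sycl::range<1> and decomposes the linear id itself, instead of reading two coordinates from a sycl::item<2>. A host-side sketch (plain C++, no SYCL required) checking that the division/remainder pair recovers exactly the (outer, inner) coordinates the old 2-D launch provided:

#include <cassert>
#include <cstddef>

int main()
{
    const std::ptrdiff_t inner_range = 5, outer_range = 3;
    for (std::ptrdiff_t flat = 0; flat < inner_range * outer_range; ++flat) {
        // same arithmetic as the kernel body above
        std::ptrdiff_t outer_gid = flat / inner_range;
        std::ptrdiff_t inner_gid = flat - inner_range * outer_gid;
        assert(outer_gid >= 0 && outer_gid < outer_range);
        assert(inner_gid >= 0 && inner_gid < inner_range);
        assert(flat == outer_gid * inner_range + inner_gid); // round-trips
    }
    return 0;
}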
@@ -1986,7 +1989,7 @@ tri(sycl::queue &exec_q,
    dpctl::tensor::usm_ndarray src,
    dpctl::tensor::usm_ndarray dst,
    char part,
-   int k = 0,
+   py::ssize_t k = 0,
    const std::vector<sycl::event> &depends = {})
{
    // array dimensions must be the same
@@ -2007,7 +2010,7 @@ tri(sycl::queue &exec_q,
    bool shapes_equal(true);
    size_t src_nelems(1);

-   for (int i = 0; i < src_nd; ++i) {
+   for (int i = 0; shapes_equal && i < src_nd; ++i) {
        src_nelems *= static_cast<size_t>(src_shape[i]);
        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
    }
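The loop condition now short-circuits at the first shape mismatch. That leaves src_nelems only partially accumulated, which is safe solely because a mismatch leads to a throw before the count is consumed. A self-contained restatement of the combined scan (hypothetical helper, not dpctl API):

#include <cstddef>
#include <stdexcept>

std::size_t checked_nelems(const std::ptrdiff_t *src_shape,
                           const std::ptrdiff_t *dst_shape, int nd)
{
    bool shapes_equal = true;
    std::size_t nelems = 1;
    for (int i = 0; shapes_equal && i < nd; ++i) { // stop at first mismatch
        nelems *= static_cast<std::size_t>(src_shape[i]);
        shapes_equal = (src_shape[i] == dst_shape[i]);
    }
    if (!shapes_equal) {
        throw std::invalid_argument("src and dst must have the same shape");
    }
    return nelems; // only meaningful when the shapes matched throughout
}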
@@ -2032,11 +2035,7 @@ tri(sycl::queue &exec_q,
    sycl::queue src_q = src.get_queue();
    sycl::queue dst_q = dst.get_queue();

-   sycl::context exec_ctx = exec_q.get_context();
-   sycl::device exec_d = exec_q.get_device();
-   if (src_q.get_context() != exec_ctx || dst_q.get_context() != exec_ctx ||
-       src_q.get_device() != exec_d || dst_q.get_device() != exec_d)
-   {
+   if (!dpctl::utils::queues_are_compatible(exec_q, {src_q, dst_q})) {
        throw py::value_error(
            "Execution queue context is not the same as allocation contexts");
    }
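dpctl::utils::queues_are_compatible packages up the hand-rolled context/device comparison deleted above. A sketch of the equivalent check it stands in for (the real helper's signature may differ; this paraphrases the removed lines):

#include <sycl/sycl.hpp>
#include <initializer_list>

bool same_context_and_device(const sycl::queue &exec_q,
                             std::initializer_list<sycl::queue> alloc_qs)
{
    const sycl::context ctx = exec_q.get_context();
    const sycl::device dev = exec_q.get_device();
    for (const sycl::queue &q : alloc_qs) {
        if (q.get_context() != ctx || q.get_device() != dev) {
            return false;
        }
    }
    return true;
}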
@@ -2060,9 +2059,8 @@ tri(sycl::queue &exec_q,
        }
    }
    else {
-       for (ssize_t i = 0; i < src_nd; i++) {
-           src_strides[i] = src_strides_raw[i];
-       }
+       std::copy(src_strides_raw, src_strides_raw + src_nd,
+                 src_strides.begin());
    }

    int dst_flags = dst.get_flags();
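std::copy expresses the raw-pointer-to-vector copy more idiomatically than the index loop it replaces; note the dst variant in the next hunk also switches the bound from src_nd to dst_nd, the self-consistent choice even though the two are verified equal earlier. A tiny equivalence check:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    const std::ptrdiff_t raw[] = {12, 4, 1}; // made-up strides
    std::vector<std::ptrdiff_t> by_loop(3), by_copy(3);
    for (std::ptrdiff_t i = 0; i < 3; i++) {
        by_loop[i] = raw[i]; // the removed loop
    }
    std::copy(raw, raw + 3, by_copy.begin()); // the replacement
    assert(by_loop == by_copy);
    return 0;
}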
@@ -2083,9 +2081,8 @@ tri(sycl::queue &exec_q,
        }
    }
    else {
-       for (ssize_t i = 0; i < src_nd; i++) {
-           dst_strides[i] = dst_strides_raw[i];
-       }
+       std::copy(dst_strides_raw, dst_strides_raw + dst_nd,
+                 dst_strides.begin());
    }

    shT simplified_shape;
@@ -2099,14 +2096,20 @@ tri(sycl::queue &exec_q,

    int nd = src_nd - 2;
    const py::ssize_t *shape = src_shape;
-   const py::ssize_t *p_src_strides = &src_strides[0];
-   const py::ssize_t *p_dst_strides = &dst_strides[0];
+   const py::ssize_t *p_src_strides = src_strides.data();
+   ;
+   const py::ssize_t *p_dst_strides = dst_strides.data();
+   ;
    simplify_iteration_space(nd, shape, p_src_strides, src_itemsize,
                             is_src_c_contig, is_src_f_contig, p_dst_strides,
                             dst_itemsize, is_dst_c_contig, is_dst_f_contig,
                             simplified_shape, simplified_src_strides,
                             simplified_dst_strides, src_offset, dst_offset);

+   if (src_offset != 0 || dst_offset != 0) {
+       throw py::value_error("Reversed slice for dst is not supported");
+   }
+
    nd += 2;
    std::vector<py::ssize_t> shape_and_strides(3 * nd);
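The new guard rejects views whose simplified iteration space starts at a nonzero displacement, which is what reversed (negative-stride) slices produce. Illustrative arithmetic only, not dpctl's simplify_iteration_space:

#include <cassert>
#include <cstddef>

int main()
{
    // A length-n view with stride -1 places element i at offset + i * (-1);
    // keeping every index in bounds forces the base offset to be n - 1.
    const std::ptrdiff_t n = 7, stride = -1;
    const std::ptrdiff_t offset = n - 1;
    for (std::ptrdiff_t i = 0; i < n; ++i) {
        const std::ptrdiff_t pos = offset + i * stride;
        assert(pos >= 0 && pos < n); // in bounds, but offset != 0
    }
    return 0;
}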
@@ -2123,7 +2126,7 @@ tri(sycl::queue &exec_q,
    shape_and_strides[3 * nd - 2] = dst_strides[src_nd - 2];
    shape_and_strides[3 * nd - 1] = dst_strides[src_nd - 1];

-   std::shared_ptr<shT> shp_shape_and_strides =
+   std::shared_ptr<shT> shp_host_shape_and_strides =
        std::make_shared<shT>(shape_and_strides);

    py::ssize_t *dev_shape_and_strides =
@@ -2132,7 +2135,7 @@ tri(sycl::queue &exec_q,
        throw std::runtime_error("Unabled to allocate device memory");
    }
    sycl::event copy_shape_and_strides = exec_q.copy<ssize_t>(
-       shp_shape_and_strides->data(), dev_shape_and_strides, 3 * nd);
+       shp_host_shape_and_strides->data(), dev_shape_and_strides, 3 * nd);

    py::ssize_t inner_range =
        shape_and_strides[nd - 1] * shape_and_strides[nd - 2];
@@ -2155,9 +2158,12 @@ tri(sycl::queue &exec_q,
    exec_q.submit([&](sycl::handler &cgh) {
        cgh.depends_on({tri_ev});
        auto ctx = exec_q.get_context();
-       cgh.host_task([shp_shape_and_strides, dev_shape_and_strides, ctx]() {
-           sycl::free(dev_shape_and_strides, ctx);
-       });
+       cgh.host_task(
+           [shp_host_shape_and_strides, dev_shape_and_strides, ctx]() {
+               // capturing shp_host_shape_and_strides ensures the underlying
+               // vector exists for the entire execution of the copying kernel
+               sycl::free(dev_shape_and_strides, ctx);
+           });
    });
    return std::make_pair(keep_args_alive(exec_q, {src, dst}, {tri_ev}),
                          tri_ev);
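The reformatted host_task keeps the same resource-management pattern: the task is ordered after the kernel and frees the USM allocation, while the by-value shared_ptr capture pins the host vector until then. A generic sketch of that pattern (names are mine, not dpctl's):

#include <sycl/sycl.hpp>
#include <cstdint>
#include <memory>
#include <vector>

sycl::event
schedule_cleanup(sycl::queue &q, sycl::event dep,
                 std::shared_ptr<std::vector<std::int64_t>> host_data,
                 std::int64_t *device_data)
{
    return q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(dep); // do not free before the dependent kernel finishes
        auto ctx = q.get_context();
        cgh.host_task([host_data, device_data, ctx]() {
            // host_data's refcount keeps the vector alive up to this point
            sycl::free(device_data, ctx);
        });
    });
}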
@@ -2373,23 +2379,25 @@ PYBIND11_MODULE(_tensor_impl, m)
        });
    m.def(
        "_tril",
-       [](sycl::queue exec_q, dpctl::tensor::usm_ndarray src,
-          dpctl::tensor::usm_ndarray dst, int k,
+       [](dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray dst,
+          py::ssize_t k, sycl::queue exec_q,
           const std::vector<sycl::event> depends)
            -> std::pair<sycl::event, sycl::event> {
            return tri(exec_q, src, dst, 'l', k, depends);
        },
-       "Tril helper function.", py::arg("sycl_queue"), py::arg("src"),
-       py::arg("dst"), py::arg("k") = 0, py::arg("depends") = py::list());
+       "Tril helper function.", py::arg("src"), py::arg("dst"),
+       py::arg("k") = 0, py::arg("sycl_queue"),
+       py::arg("depends") = py::list());

    m.def(
        "_triu",
-       [](sycl::queue exec_q, dpctl::tensor::usm_ndarray src,
-          dpctl::tensor::usm_ndarray dst, int k,
+       [](dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray dst,
+          py::ssize_t k, sycl::queue exec_q,
           const std::vector<sycl::event> depends)
            -> std::pair<sycl::event, sycl::event> {
            return tri(exec_q, src, dst, 'u', k, depends);
        },
-       "Triu helper function.", py::arg("sycl_queue"), py::arg("src"),
-       py::arg("dst"), py::arg("k") = 0, py::arg("depends") = py::list());
+       "Triu helper function.", py::arg("src"), py::arg("dst"),
+       py::arg("k") = 0, py::arg("sycl_queue"),
+       py::arg("depends") = py::list());
}
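The bindings move sycl_queue after the defaulted k, so the Python-visible order becomes (src, dst, k=0, sycl_queue, depends=[]). pybind11 performs its own argument matching rather than enforcing Python's "no required parameter after a defaulted one" rule, so sycl_queue simply remains required. A toy binding (hypothetical module and function names) exercising the same layout:

#include <pybind11/pybind11.h>

namespace py = pybind11;

PYBIND11_MODULE(toy_args, m)
{
    // k carries a default; queue_id after it does not, so callers must
    // still supply queue_id, either positionally or by keyword.
    m.def(
        "describe",
        [](int src, int dst, py::ssize_t k, int queue_id) {
            return src * 1000 + dst * 100 + static_cast<int>(k) * 10 +
                   queue_id;
        },
        py::arg("src"), py::arg("dst"), py::arg("k") = 0,
        py::arg("queue_id"));
}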