
Commit 30223a2

Merge pull request #934 from IntelPython/cleanup-tensor-part5
2 parents: 67802a3 + 51fd051

20 files changed: +2448 −1422 lines

dpctl/tensor/CMakeLists.txt

Lines changed: 10 additions & 0 deletions
@@ -18,12 +18,22 @@ add_custom_target(_dpctl4pybind11_deps
 set(python_module_name _tensor_impl)
 pybind11_add_module(${python_module_name} MODULE
   ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_py.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp
 )
 target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel)
 target_include_directories(${python_module_name}
   PRIVATE
   ${CMAKE_CURRENT_SOURCE_DIR}/../include
   ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include
+  ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/
 )
 add_dependencies(${python_module_name} _dpctl4pybind11_deps)
 install(TARGETS ${python_module_name} DESTINATION "dpctl/tensor")
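The module used to be built from tensor_py.cpp alone; this change splits the implementation across several translation units. A plausible sketch of how such a split is wired back together (editor's illustration: the binding code in tensor_py.cpp is not part of this hunk, and the header name and Python-level name below are assumptions):

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <vector>

#include "dpctl4pybind11.hpp"           // pybind11 casters for queue/usm_ndarray/event
#include "copy_and_cast_usm_to_usm.hpp" // assumed header declaring the functions below

namespace py = pybind11;

PYBIND11_MODULE(_tensor_impl, m)
{
    using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray;
    using dpctl::tensor::py_internal::init_copy_and_cast_usm_to_usm_dispatch_tables;

    // populate the per-dtype dispatch tables once, at module import time
    init_copy_and_cast_usm_to_usm_dispatch_tables();

    m.def("_copy_usm_ndarray_into_usm_ndarray",
          &copy_usm_ndarray_into_usm_ndarray,
          "Copy (and cast) elements of src usm_ndarray into dst usm_ndarray",
          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
          py::arg("depends") = std::vector<sycl::event>());
}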
Lines changed: 359 additions & 0 deletions
@@ -0,0 +1,359 @@
//===-- tensor_py.cpp - Implementation of _tensor_impl module --*-C++-*-/===//
//
// Data Parallel Control (dpctl)
//
// Copyright 2020-2022 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file defines functions of dpctl.tensor._tensor_impl extensions
//===----------------------------------------------------------------------===//

#include <CL/sycl.hpp>
#include <algorithm>
#include <complex>
#include <cstdint>
#include <pybind11/complex.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <thread>
#include <type_traits>
#include <utility>

#include "dpctl4pybind11.hpp"
#include "kernels/copy_and_cast.hpp"
#include "utils/type_dispatch.hpp"
#include "utils/type_utils.hpp"

#include "simplify_iteration_space.hpp"

namespace dpctl
{
namespace tensor
{
namespace py_internal
{

namespace _ns = dpctl::tensor::detail;

using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_1d_fn_ptr_t;
using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_2d_fn_ptr_t;
using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_generic_fn_ptr_t;

static copy_and_cast_generic_fn_ptr_t
    copy_and_cast_generic_dispatch_table[_ns::num_types][_ns::num_types];
static copy_and_cast_1d_fn_ptr_t
    copy_and_cast_1d_dispatch_table[_ns::num_types][_ns::num_types];
static copy_and_cast_2d_fn_ptr_t
    copy_and_cast_2d_dispatch_table[_ns::num_types][_ns::num_types];

namespace py = pybind11;

using dpctl::tensor::c_contiguous_strides;
using dpctl::tensor::f_contiguous_strides;

using dpctl::utils::keep_args_alive;

sycl::event _populate_packed_shape_strides_for_copycast_kernel(
    sycl::queue exec_q,
    py::ssize_t *device_shape_strides, // to be populated
    const std::vector<py::ssize_t> &common_shape,
    const std::vector<py::ssize_t> &src_strides,
    const std::vector<py::ssize_t> &dst_strides)
{
    // memory transfer optimization: using a USM-host temporary speeds up
    // transfer to the device, especially on dGPUs
    using usm_host_allocatorT =
        sycl::usm_allocator<py::ssize_t, sycl::usm::alloc::host>;
    using shT = std::vector<py::ssize_t, usm_host_allocatorT>;
    size_t nd = common_shape.size();

    usm_host_allocatorT allocator(exec_q);

    // create a host temporary for the packed shape and strides, managed by a
    // shared pointer. The packed vector is the concatenation of common_shape,
    // src_strides and dst_strides
    std::shared_ptr<shT> shp_host_shape_strides =
        std::make_shared<shT>(3 * nd, allocator);
    std::copy(common_shape.begin(), common_shape.end(),
              shp_host_shape_strides->begin());

    std::copy(src_strides.begin(), src_strides.end(),
              shp_host_shape_strides->begin() + nd);

    std::copy(dst_strides.begin(), dst_strides.end(),
              shp_host_shape_strides->begin() + 2 * nd);

    sycl::event copy_shape_ev = exec_q.copy<py::ssize_t>(
        shp_host_shape_strides->data(), device_shape_strides,
        shp_host_shape_strides->size());

    exec_q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(copy_shape_ev);
        cgh.host_task([shp_host_shape_strides]() {
            // the captured shared pointer keeps the host temporary alive
            // until the copy operation completes
        });
    });

    return copy_shape_ev;
}

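// Illustration (editor's sketch, not part of this diff): for nd == 2 with
// common_shape = {3, 4}, src_strides = {4, 1} and dst_strides = {1, 3},
// the packed host temporary copied into device_shape_strides holds the
// 3 * nd values
//     {3, 4,   4, 1,   1, 3}
//      shape   src      dst
// which the generic copy-and-cast kernel unpacks as the shape, the source
// strides and the destination strides.
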
std::pair<sycl::event, sycl::event>
copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src,
                                  dpctl::tensor::usm_ndarray dst,
                                  sycl::queue exec_q,
                                  const std::vector<sycl::event> &depends = {})
{
    // array dimensions must be the same
    int src_nd = src.get_ndim();
    int dst_nd = dst.get_ndim();

    if (src_nd != dst_nd) {
        throw py::value_error("Array dimensions are not the same.");
    }

    // shapes must be the same
    const py::ssize_t *src_shape = src.get_shape_raw();
    const py::ssize_t *dst_shape = dst.get_shape_raw();

    bool shapes_equal(true);
    size_t src_nelems(1);

    for (int i = 0; i < src_nd; ++i) {
        src_nelems *= static_cast<size_t>(src_shape[i]);
        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
    }
    if (!shapes_equal) {
        throw py::value_error("Array shapes are not the same.");
    }

    if (src_nelems == 0) {
        // nothing to do
        return std::make_pair(sycl::event(), sycl::event());
    }

    auto dst_offsets = dst.get_minmax_offsets();
    // destination must be ample enough to accommodate all elements
    {
        size_t range =
            static_cast<size_t>(dst_offsets.second - dst_offsets.first);
        if (range + 1 < src_nelems) {
            throw py::value_error(
                "Destination array can not accommodate all the "
                "elements of source array.");
        }
    }

    // check compatibility of the execution queue and the allocation queues
    sycl::queue src_q = src.get_queue();
    sycl::queue dst_q = dst.get_queue();

    if (!dpctl::utils::queues_are_compatible(exec_q, {src_q, dst_q})) {
        throw py::value_error(
            "Execution queue is not compatible with allocation queues");
    }

    int src_typenum = src.get_typenum();
    int dst_typenum = dst.get_typenum();

    auto array_types = dpctl::tensor::detail::usm_ndarray_types();
    int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
    int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);

    char *src_data = src.get_data();
    char *dst_data = dst.get_data();

    // check that the arrays do not overlap, so that concurrent copying is safe
    auto src_offsets = src.get_minmax_offsets();
    int src_elem_size = src.get_elemsize();
    int dst_elem_size = dst.get_elemsize();

    bool memory_overlap =
        ((dst_data - src_data > src_offsets.second * src_elem_size -
                                    dst_offsets.first * dst_elem_size) &&
         (src_data - dst_data > dst_offsets.second * dst_elem_size -
                                    src_offsets.first * src_elem_size));
    if (memory_overlap) {
        // TODO: could use a temporary, but this is done by the caller
        throw py::value_error("Arrays index overlapping segments of memory");
    }

    bool is_src_c_contig = src.is_c_contiguous();
    bool is_src_f_contig = src.is_f_contiguous();

    bool is_dst_c_contig = dst.is_c_contiguous();
    bool is_dst_f_contig = dst.is_f_contiguous();

    // check for applicability of special cases:
    // (same type && (both C-contiguous || both F-contiguous))
    bool both_c_contig = (is_src_c_contig && is_dst_c_contig);
    bool both_f_contig = (is_src_f_contig && is_dst_f_contig);
    if (both_c_contig || both_f_contig) {
        if (src_type_id == dst_type_id) {

            sycl::event copy_ev =
                exec_q.memcpy(static_cast<void *>(dst_data),
                              static_cast<const void *>(src_data),
                              src_nelems * src_elem_size, depends);

            // make sure src and dst are not GC-ed before copy_ev is complete
            return std::make_pair(
                keep_args_alive(exec_q, {src, dst}, {copy_ev}), copy_ev);
        }
        // With contract_iter2 in place, there is no need to write
        // dedicated kernels for casting between contiguous arrays
    }

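    // Illustration (editor's assumption, not part of this commit): for two
    // C-contiguous arrays of differing types and shape (2, 3, 4), the
    // simplify_iteration_space call below is expected to collapse the
    // iteration space to a single dimension of extent 24 with unit strides,
    // so the nd == 1 specialization covers contiguous casts and no dedicated
    // contiguous cast kernel is needed.
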
    const py::ssize_t *src_strides = src.get_strides_raw();
    const py::ssize_t *dst_strides = dst.get_strides_raw();

    using shT = std::vector<py::ssize_t>;
    shT simplified_shape;
    shT simplified_src_strides;
    shT simplified_dst_strides;
    py::ssize_t src_offset(0);
    py::ssize_t dst_offset(0);

    int nd = src_nd;
    const py::ssize_t *shape = src_shape;

    constexpr py::ssize_t src_itemsize = 1; // in elements
    constexpr py::ssize_t dst_itemsize = 1; // in elements

    // all args except itemsizes and is_?_contig bools can be modified by
    // reference
    dpctl::tensor::py_internal::simplify_iteration_space(
        nd, shape, src_strides, src_itemsize, is_src_c_contig, is_src_f_contig,
        dst_strides, dst_itemsize, is_dst_c_contig, is_dst_f_contig,
        simplified_shape, simplified_src_strides, simplified_dst_strides,
        src_offset, dst_offset);

    if (nd < 3) {
        if (nd == 1) {
            std::array<py::ssize_t, 1> shape_arr = {shape[0]};
            // strides may be null
            std::array<py::ssize_t, 1> src_strides_arr = {
                (src_strides ? src_strides[0] : 1)};
            std::array<py::ssize_t, 1> dst_strides_arr = {
                (dst_strides ? dst_strides[0] : 1)};

            auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id];
            sycl::event copy_and_cast_1d_event = fn(
                exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr,
                src_data, src_offset, dst_data, dst_offset, depends);

            return std::make_pair(
                keep_args_alive(exec_q, {src, dst}, {copy_and_cast_1d_event}),
                copy_and_cast_1d_event);
        }
        else if (nd == 2) {
            std::array<py::ssize_t, 2> shape_arr = {shape[0], shape[1]};
            std::array<py::ssize_t, 2> src_strides_arr = {src_strides[0],
                                                          src_strides[1]};
            std::array<py::ssize_t, 2> dst_strides_arr = {dst_strides[0],
                                                          dst_strides[1]};

            auto fn = copy_and_cast_2d_dispatch_table[dst_type_id][src_type_id];

            sycl::event copy_and_cast_2d_event = fn(
                exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr,
                src_data, src_offset, dst_data, dst_offset, depends);

            return std::make_pair(
                keep_args_alive(exec_q, {src, dst}, {copy_and_cast_2d_event}),
                copy_and_cast_2d_event);
        }
        else if (nd == 0) { // case of a scalar
            assert(src_nelems == 1);
            std::array<py::ssize_t, 1> shape_arr = {1};
            std::array<py::ssize_t, 1> src_strides_arr = {1};
            std::array<py::ssize_t, 1> dst_strides_arr = {1};

            auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id];

            sycl::event copy_and_cast_0d_event = fn(
                exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr,
                src_data, src_offset, dst_data, dst_offset, depends);

            return std::make_pair(
                keep_args_alive(exec_q, {src, dst}, {copy_and_cast_0d_event}),
                copy_and_cast_0d_event);
        }
    }

    // Generic implementation
    auto copy_and_cast_fn =
        copy_and_cast_generic_dispatch_table[dst_type_id][src_type_id];

    // If shape/strides are accessed with accessors, buffer destructor
    // will force synchronization.
    py::ssize_t *shape_strides =
        sycl::malloc_device<py::ssize_t>(3 * nd, exec_q);

    if (shape_strides == nullptr) {
        throw std::runtime_error("Unable to allocate device memory");
    }

    sycl::event copy_shape_ev =
        _populate_packed_shape_strides_for_copycast_kernel(
            exec_q, shape_strides, simplified_shape, simplified_src_strides,
            simplified_dst_strides);

    sycl::event copy_and_cast_generic_ev = copy_and_cast_fn(
        exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data,
        dst_offset, depends, {copy_shape_ev});

    // async free of the shape_strides temporary
    auto ctx = exec_q.get_context();
    exec_q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(copy_and_cast_generic_ev);
        cgh.host_task(
            [ctx, shape_strides]() { sycl::free(shape_strides, ctx); });
    });

    return std::make_pair(
        keep_args_alive(exec_q, {src, dst}, {copy_and_cast_generic_ev}),
        copy_and_cast_generic_ev);
}

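// Note (editorial): the generic path above pairs each temporary with an
// event: keep_args_alive ties the lifetimes of src and dst to the returned
// event, and a host_task depending on copy_and_cast_generic_ev frees the
// shape_strides device allocation asynchronously. A standalone sketch of
// that async-free pattern follows this listing.
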
void init_copy_and_cast_usm_to_usm_dispatch_tables(void)
{
    using namespace dpctl::tensor::detail;

    using dpctl::tensor::kernels::copy_and_cast::CopyAndCastGenericFactory;
    DispatchTableBuilder<copy_and_cast_generic_fn_ptr_t,
                         CopyAndCastGenericFactory, num_types>
        dtb_generic;
    dtb_generic.populate_dispatch_table(copy_and_cast_generic_dispatch_table);

    using dpctl::tensor::kernels::copy_and_cast::CopyAndCast1DFactory;
    DispatchTableBuilder<copy_and_cast_1d_fn_ptr_t, CopyAndCast1DFactory,
                         num_types>
        dtb_1d;
    dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table);

    using dpctl::tensor::kernels::copy_and_cast::CopyAndCast2DFactory;
    DispatchTableBuilder<copy_and_cast_2d_fn_ptr_t, CopyAndCast2DFactory,
                         num_types>
        dtb_2d;
    dtb_2d.populate_dispatch_table(copy_and_cast_2d_dispatch_table);
}

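// Note (editorial): the three dispatch tables populated above are plain 2D
// arrays of kernel-launching function pointers indexed as
// table[dst_type_id][src_type_id]; populating them once here keeps the hot
// path in copy_usm_ndarray_into_usm_ndarray down to a single table lookup.
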
} // namespace py_internal
} // namespace tensor
} // namespace dpctl
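
The asynchronous deallocation used in the generic path above can be isolated as a reusable pattern. Below is a minimal, self-contained sketch (editor's illustration; the helper name async_free_after is not from this commit), assuming a SYCL 2020 compiler with host_task support:

#include <CL/sycl.hpp>

// Free a USM allocation only after `dep` has completed, without blocking the
// submitting thread: a host_task that depends on the kernel event performs
// the sycl::free.
void async_free_after(sycl::queue &q, sycl::event dep, void *tmp)
{
    sycl::context ctx = q.get_context();
    q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(dep);
        cgh.host_task([ctx, tmp]() { sycl::free(tmp, ctx); });
    });
}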

0 commit comments
