Skip to content

Commit 7d59843

Browse files
Also use the contiguous kernel if the simplified iteration is 1D and has unit strides
Example where it helps:

```
In [1]: import dpctl, dpctl.tensor as dpt

In [2]: x = dpt.arange(1234*7873, dtype=dpt.int32)

In [3]: xx = dpt.permute_dims(dpt.reshape(x, (2, 617, 7873)), (1,2,0))

In [4]: yy = dpt.permute_dims(dpt.reshape(dpt.empty_like(x, dtype="f4"), (2, 617, 7873)), (1,2,0))

In [5]: %timeit yy[...] = xx
1.07 ms ± 93.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
```

On master, the same assignment takes about 2.8 ms on Iris Xe.
1 parent 2669110 commit 7d59843

File tree

1 file changed

+19
-8
lines changed

1 file changed

+19
-8
lines changed

dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src,
157157
src_nelems * src_elem_size, depends);
158158
}
159159
else {
160-
auto fn =
160+
auto contig_fn =
161161
copy_and_cast_contig_dispatch_table[dst_type_id][src_type_id];
162162
copy_ev =
163163
contig_fn(exec_q, src_nelems, src_data, dst_data, depends);
@@ -166,8 +166,6 @@ copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src,
166166
return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}),
167167
copy_ev);
168168
}
169-
// With contract_iter2 in place, there is no need to write
170-
// dedicated kernels for casting between contiguous arrays
171169

172170
const py::ssize_t *src_strides = src.get_strides_raw();
173171
const py::ssize_t *dst_strides = dst.get_strides_raw();
@@ -202,11 +200,24 @@ copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src,
202200
std::array<py::ssize_t, 1> dst_strides_arr = {
203201
(dst_strides ? dst_strides[0] : 1)};
204202

205-
auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id];
206-
sycl::event copy_and_cast_1d_event = fn(
207-
exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr,
208-
src_data, src_offset, dst_data, dst_offset, depends);
209-
203+
sycl::event copy_and_cast_1d_event;
204+
if ((src_strides_arr[0] == 1) && (dst_strides_arr[0] == 1) &&
205+
(src_offset == 0) && (dst_offset == 0))
206+
{
207+
auto contig_fn =
208+
copy_and_cast_contig_dispatch_table[dst_type_id]
209+
[src_type_id];
210+
sycl::event copy_and_cast_1d_event =
211+
contig_fn(exec_q, src_nelems, src_data, dst_data, depends);
212+
}
213+
else {
214+
auto fn =
215+
copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id];
216+
copy_and_cast_1d_event =
217+
fn(exec_q, src_nelems, shape_arr, src_strides_arr,
218+
dst_strides_arr, src_data, src_offset, dst_data,
219+
dst_offset, depends);
220+
}
210221
return std::make_pair(
211222
keep_args_alive(exec_q, {src, dst}, {copy_and_cast_1d_event}),
212223
copy_and_cast_1d_event);

0 commit comments

Comments
 (0)