Simplify iteration tweak #1032

Merged · 3 commits · Jan 6, 2023
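The patch touches three files: the copy-and-cast kernels get const-qualified source pointers and stride tables (plus `#pragma unroll` on the fixed-dimension offset loops), `simplify_iteration_space` is fixed to rebind its output shape/stride pointers on every path rather than only in the contraction branch, and the copy tests learn to skip double- and half-precision cases on devices lacking the corresponding aspects, while allocating every array against the test queue's device.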
46 changes: 23 additions & 23 deletions dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
@@ -51,14 +51,14 @@ template <typename srcT, typename dstT> class Caster
{
public:
Caster() = default;
- void operator()(char *src,
+ void operator()(const char *src,
std::ptrdiff_t src_offset,
char *dst,
std::ptrdiff_t dst_offset) const
{
using dpctl::tensor::type_utils::convert_impl;

- srcT *src_ = reinterpret_cast<srcT *>(src) + src_offset;
+ const srcT *src_ = reinterpret_cast<const srcT *>(src) + src_offset;
dstT *dst_ = reinterpret_cast<dstT *>(dst) + dst_offset;
*dst_ = convert_impl<dstT, srcT>(*src_);
}
@@ -67,17 +67,17 @@ template <typename srcT, typename dstT> class Caster
template <typename CastFnT> class GenericCopyFunctor
{
private:
- char *src_ = nullptr;
+ const char *src_ = nullptr;
char *dst_ = nullptr;
- py::ssize_t *shape_strides_ = nullptr;
+ const py::ssize_t *shape_strides_ = nullptr;
int nd_ = 0;
py::ssize_t src_offset0 = 0;
py::ssize_t dst_offset0 = 0;

public:
- GenericCopyFunctor(char *src_cp,
+ GenericCopyFunctor(const char *src_cp,
char *dst_cp,
- py::ssize_t *shape_strides,
+ const py::ssize_t *shape_strides,
int nd,
py::ssize_t src_offset,
py::ssize_t dst_offset)
@@ -93,13 +93,11 @@ template <typename CastFnT> class GenericCopyFunctor
CIndexer_vector<py::ssize_t> indxr(nd_);
indxr.get_displacement<const py::ssize_t *, const py::ssize_t *>(
static_cast<py::ssize_t>(wiid.get(0)),
- const_cast<const py::ssize_t *>(shape_strides_), // common shape
- const_cast<const py::ssize_t *>(shape_strides_ +
-     nd_), // src strides
- const_cast<const py::ssize_t *>(shape_strides_ +
-     2 * nd_), // dst strides
- src_offset, // modified by reference
- dst_offset // modified by reference
+ shape_strides_, // common shape
+ shape_strides_ + nd_, // src strides
+ shape_strides_ + 2 * nd_, // dst strides
+ src_offset, // modified by reference
+ dst_offset // modified by reference
);
CastFnT fn{};
fn(src_, src_offset0 + src_offset, dst_, dst_offset0 + dst_offset);
@@ -109,7 +107,7 @@
template <int nd, typename CastFnT> class NDSpecializedCopyFunctor
{
private:
- char *src_ = nullptr;
+ const char *src_ = nullptr;
char *dst_ = nullptr;
CIndexer_array<nd, py::ssize_t> indxr;
const std::array<py::ssize_t, nd> src_strides_;
@@ -119,8 +117,8 @@ template <int nd, typename CastFnT> class NDSpecializedCopyFunctor
py::ssize_t dst_offset0 = 0;

public:
- NDSpecializedCopyFunctor(char *src_cp, // USM pointer
-                          char *dst_cp, // USM pointer
+ NDSpecializedCopyFunctor(const char *src_cp, // USM pointer
+                          char *dst_cp, // USM pointer
const std::array<py::ssize_t, nd> shape,
const std::array<py::ssize_t, nd> src_strides,
const std::array<py::ssize_t, nd> dst_strides,
@@ -140,8 +138,10 @@ template <int nd, typename CastFnT> class NDSpecializedCopyFunctor

local_indxr.set(wiid.get(0));
auto mi = local_indxr.get();
+ #pragma unroll
for (int i = 0; i < nd; ++i)
src_offset += mi[i] * src_strides_[i];
+ #pragma unroll
for (int i = 0; i < nd; ++i)
dst_offset += mi[i] * dst_strides_[i];

@@ -161,8 +161,8 @@ typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)(
sycl::queue,
size_t,
int,
- py::ssize_t *,
- char *,
+ const py::ssize_t *,
+ const char *,
py::ssize_t,
char *,
py::ssize_t,
@@ -207,8 +207,8 @@ sycl::event
copy_and_cast_generic_impl(sycl::queue q,
size_t nelems,
int nd,
- py::ssize_t *shape_and_strides,
- char *src_p,
+ const py::ssize_t *shape_and_strides,
+ const char *src_p,
py::ssize_t src_offset,
char *dst_p,
py::ssize_t dst_offset,
@@ -256,7 +256,7 @@ typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)(
const std::array<py::ssize_t, 1>,
const std::array<py::ssize_t, 1>,
const std::array<py::ssize_t, 1>,
- char *,
+ const char *,
py::ssize_t,
char *,
py::ssize_t,
@@ -272,7 +272,7 @@ typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)(
const std::array<py::ssize_t, 2>,
const std::array<py::ssize_t, 2>,
const std::array<py::ssize_t, 2>,
- char *,
+ const char *,
py::ssize_t,
char *,
py::ssize_t,
@@ -314,7 +314,7 @@ copy_and_cast_nd_specialized_impl(sycl::queue q,
const std::array<py::ssize_t, nd> shape,
const std::array<py::ssize_t, nd> src_strides,
const std::array<py::ssize_t, nd> dst_strides,
- char *src_p,
+ const char *src_p,
py::ssize_t src_offset,
char *dst_p,
py::ssize_t dst_offset,
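Two themes run through this header. First, const-correctness: the source buffer is only ever read, so it now travels as `const char *` (and the shape/stride table as `const py::ssize_t *`), which in turn lets the `const_cast` calls in `GenericCopyFunctor::operator()` disappear. Second, the fixed-dimension functor unrolls its small offset loops with `#pragma unroll`, since `nd` is a compile-time constant there. Below is a toy, self-contained sketch of the const-correct cast pattern; the names are illustrative only, and the real kernel dispatches through `convert_impl` and runs inside a SYCL kernel rather than on the host.

```cpp
#include <cstddef>

// Toy version of the Caster pattern: the source pointer is typed as
// const char * end-to-end, so no const_cast is ever required.
template <typename srcT, typename dstT> struct toy_caster
{
    void operator()(const char *src,
                    std::ptrdiff_t src_offset,
                    char *dst,
                    std::ptrdiff_t dst_offset) const
    {
        // Reinterpret the byte pointers as typed pointers; the source
        // stays const all the way down to the load.
        const srcT *src_ = reinterpret_cast<const srcT *>(src) + src_offset;
        dstT *dst_ = reinterpret_cast<dstT *>(dst) + dst_offset;
        *dst_ = static_cast<dstT>(*src_); // real code uses convert_impl
    }
};

int main()
{
    float in[2] = {1.5f, 2.5f};
    double out[2] = {};
    // Copy-and-cast element 1 of `in` into element 0 of `out`.
    toy_caster<float, double>{}(reinterpret_cast<const char *>(in), 1,
                                reinterpret_cast<char *>(out), 0);
    return out[0] == 2.5 ? 0 : 1;
}
```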
10 changes: 5 additions & 5 deletions dpctl/tensor/libtensor/source/simplify_iteration_space.cpp
@@ -120,11 +120,6 @@ void simplify_iteration_space(int &nd,
simplified_dst_strides.resize(contracted_nd);

nd = contracted_nd;
- shape = const_cast<const py::ssize_t *>(simplified_shape.data());
- src_strides =
-     const_cast<const py::ssize_t *>(simplified_src_strides.data());
- dst_strides =
-     const_cast<const py::ssize_t *>(simplified_dst_strides.data());
}
else if (nd == 1) {
// Populate vectors
@@ -171,6 +166,11 @@ void simplify_iteration_space(int &nd,
assert(simplified_src_strides.size() == static_cast<size_t>(nd));
assert(simplified_dst_strides.size() == static_cast<size_t>(nd));
}
+ shape = const_cast<const py::ssize_t *>(simplified_shape.data());
+ src_strides =
+     const_cast<const py::ssize_t *>(simplified_src_strides.data());
+ dst_strides =
+     const_cast<const py::ssize_t *>(simplified_dst_strides.data());
}

} // namespace py_internal
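This is the actual "simplify iteration" tweak. Previously the out-parameters `shape`, `src_strides`, and `dst_strides` were rebound to the `simplified_*` vectors only inside the contraction branch, so other paths (e.g. `nd == 1`) returned with those pointers still aimed at the caller's originals even though the simplified vectors had been populated. Hoisting the rebinding below the whole `if`/`else if` chain makes every path end the same way. A reduced model of the control-flow fix follows; the names and signature are simplified, not the dpctl ones.

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Reduced model of simplify_iteration_space: every branch fills
// `simplified`, so the out-pointer must be rebound on every path,
// not only inside the first branch (the pre-fix behavior).
void simplify(int &nd, const int *&shape, std::vector<int> &simplified)
{
    if (nd > 1) {
        // contraction branch (contraction logic elided)
        simplified.assign(shape, shape + nd);
    }
    else if (nd == 1) {
        // the 1-d branch also populates the vector
        simplified.assign(shape, shape + 1);
    }
    assert(simplified.size() == static_cast<size_t>(nd));
    // The fix: rebind after the branches, so *both* paths leave
    // `shape` pointing at storage owned by `simplified`.
    shape = simplified.data();
}

int main()
{
    std::vector<int> owned;
    int dims[1] = {42};
    const int *shape = dims;
    int nd = 1;
    simplify(nd, shape, owned);
    // Before the fix, the nd == 1 path left `shape` == dims.
    return (shape == owned.data() && shape[0] == 42) ? 0 : 1;
}
```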
67 changes: 60 additions & 7 deletions dpctl/tensor/libtensor/tests/test_copy.py
@@ -40,6 +40,14 @@
]


+ def _typestr_has_fp64(arr_typestr):
+     return arr_typestr in ["f8", "c16"]
+
+
+ def _typestr_has_fp16(arr_typestr):
+     return arr_typestr in ["f2"]


@pytest.fixture(params=_usm_types_list)
def usm_type(request):
return request.param
Expand Down Expand Up @@ -95,6 +103,14 @@ def test_copy1d_c_contig(src_typestr, dst_typestr):
q = dpctl.SyclQueue()
except dpctl.SyclQueueCreationError:
pytest.skip("Queue could not be created")
+ if not q.sycl_device.has_aspect_fp64 and (
+     _typestr_has_fp64(src_typestr) or _typestr_has_fp64(dst_typestr)
+ ):
+     pytest.skip("Device does not support double precision")
+ if not q.sycl_device.has_aspect_fp16 and (
+     _typestr_has_fp16(src_typestr) or _typestr_has_fp16(dst_typestr)
+ ):
+     pytest.skip("Device does not support half precision")
src_dt = np.dtype(src_typestr)
dst_dt = np.dtype(dst_typestr)
Xnp = _random_vector(4096, src_dt)
@@ -113,6 +129,14 @@ def test_copy1d_strided(src_typestr, dst_typestr):
q = dpctl.SyclQueue()
except dpctl.SyclQueueCreationError:
pytest.skip("Queue could not be created")
+ if not q.sycl_device.has_aspect_fp64 and (
+     _typestr_has_fp64(src_typestr) or _typestr_has_fp64(dst_typestr)
+ ):
+     pytest.skip("Device does not support double precision")
+ if not q.sycl_device.has_aspect_fp16 and (
+     _typestr_has_fp16(src_typestr) or _typestr_has_fp16(dst_typestr)
+ ):
+     pytest.skip("Device does not support half precision")
src_dt = np.dtype(src_typestr)
dst_dt = np.dtype(dst_typestr)
Xnp = _random_vector(4096, src_dt)
@@ -131,7 +155,12 @@ def test_copy1d_strided(src_typestr, dst_typestr):
assert are_close(Ynp, dpt.asnumpy(Y))

# now 0-strided source
- X = dpt.usm_ndarray((4096,), dtype=src_typestr, strides=(0,))
+ X = dpt.usm_ndarray(
+     (4096,),
+     dtype=src_typestr,
+     strides=(0,),
+     buffer_ctor_kwargs={"queue": q},
+ )
X[0] = Xnp[0]
Y = dpt.empty(X.shape, dtype=dst_typestr, sycl_queue=q)
hev, ev = ti._copy_usm_ndarray_into_usm_ndarray(src=X, dst=Y, sycl_queue=q)
@@ -145,6 +174,14 @@ def test_copy1d_strided2(src_typestr, dst_typestr):
q = dpctl.SyclQueue()
except dpctl.SyclQueueCreationError:
pytest.skip("Queue could not be created")
+ if not q.sycl_device.has_aspect_fp64 and (
+     _typestr_has_fp64(src_typestr) or _typestr_has_fp64(dst_typestr)
+ ):
+     pytest.skip("Device does not support double precision")
+ if not q.sycl_device.has_aspect_fp16 and (
+     _typestr_has_fp16(src_typestr) or _typestr_has_fp16(dst_typestr)
+ ):
+     pytest.skip("Device does not support half precision")
src_dt = np.dtype(src_typestr)
dst_dt = np.dtype(dst_typestr)
Xnp = _random_vector(4096, src_dt)
@@ -172,6 +209,14 @@ def test_copy2d(src_typestr, dst_typestr, st1, sgn1, st2, sgn2):
q = dpctl.SyclQueue()
except dpctl.SyclQueueCreationError:
pytest.skip("Queue could not be created")
+ if not q.sycl_device.has_aspect_fp64 and (
+     _typestr_has_fp64(src_typestr) or _typestr_has_fp64(dst_typestr)
+ ):
+     pytest.skip("Device does not support double precision")
+ if not q.sycl_device.has_aspect_fp16 and (
+     _typestr_has_fp16(src_typestr) or _typestr_has_fp16(dst_typestr)
+ ):
+     pytest.skip("Device does not support half precision")

src_dt = np.dtype(src_typestr)
dst_dt = np.dtype(dst_typestr)
@@ -188,16 +233,16 @@ def test_copy2d(src_typestr, dst_typestr, st1, sgn1, st2, sgn2):
slice(None, None, st1 * sgn1),
slice(None, None, st2 * sgn2),
]
- Y = dpt.empty((n1, n2), dtype=dst_dt)
+ Y = dpt.empty((n1, n2), dtype=dst_dt, device=X.device)
hev, ev = ti._copy_usm_ndarray_into_usm_ndarray(src=X, dst=Y, sycl_queue=q)
Ynp = _force_cast(Xnp, dst_dt)
hev.wait()
assert are_close(Ynp, dpt.asnumpy(Y))
- Yst = dpt.empty((2 * n1, n2), dtype=dst_dt)[::2, ::-1]
+ Yst = dpt.empty((2 * n1, n2), dtype=dst_dt, device=X.device)[::2, ::-1]
hev, ev = ti._copy_usm_ndarray_into_usm_ndarray(
src=X, dst=Yst, sycl_queue=q
)
- Y = dpt.empty((n1, n2), dtype=dst_dt)
+ Y = dpt.empty((n1, n2), dtype=dst_dt, device=X.device)
hev2, ev2 = ti._copy_usm_ndarray_into_usm_ndarray(
src=Yst, dst=Y, sycl_queue=q, depends=[ev]
)
@@ -220,6 +265,14 @@ def test_copy3d(src_typestr, dst_typestr, st1, sgn1, st2, sgn2, st3, sgn3):
except dpctl.SyclQueueCreationError:
pytest.skip("Queue could not be created")

+ if not q.sycl_device.has_aspect_fp64 and (
+     _typestr_has_fp64(src_typestr) or _typestr_has_fp64(dst_typestr)
+ ):
+     pytest.skip("Device does not support double precision")
+ if not q.sycl_device.has_aspect_fp16 and (
+     _typestr_has_fp16(src_typestr) or _typestr_has_fp16(dst_typestr)
+ ):
+     pytest.skip("Device does not support half precision")
src_dt = np.dtype(src_typestr)
dst_dt = np.dtype(dst_typestr)
n1, n2, n3 = 5, 4, 6
@@ -237,16 +290,16 @@ def test_copy3d(src_typestr, dst_typestr, st1, sgn1, st2, sgn2, st3, sgn3):
slice(None, None, st2 * sgn2),
slice(None, None, st3 * sgn3),
]
- Y = dpt.empty((n1, n2, n3), dtype=dst_dt)
+ Y = dpt.empty((n1, n2, n3), dtype=dst_dt, device=X.device)
hev, ev = ti._copy_usm_ndarray_into_usm_ndarray(src=X, dst=Y, sycl_queue=q)
Ynp = _force_cast(Xnp, dst_dt)
hev.wait()
assert are_close(Ynp, dpt.asnumpy(Y)), "1"
- Yst = dpt.empty((2 * n1, n2, n3), dtype=dst_dt)[::2, ::-1]
+ Yst = dpt.empty((2 * n1, n2, n3), dtype=dst_dt, device=X.device)[::2, ::-1]
hev2, ev2 = ti._copy_usm_ndarray_into_usm_ndarray(
src=X, dst=Yst, sycl_queue=q
)
- Y2 = dpt.empty((n1, n2, n3), dtype=dst_dt)
+ Y2 = dpt.empty((n1, n2, n3), dtype=dst_dt, device=X.device)
hev3, ev3 = ti._copy_usm_ndarray_into_usm_ndarray(
src=Yst, dst=Y2, sycl_queue=q, depends=[ev2]
)
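The test changes follow one pattern: before exercising an `f8`/`c16` or `f2` dtype, each test queries the queue's device for the matching aspect and skips when it is absent, and every array is allocated against that same queue (via `device=X.device` or `buffer_ctor_kwargs={"queue": q}`) so the test never silently targets the default device instead. The dpctl properties `has_aspect_fp64`/`has_aspect_fp16` correspond to SYCL 2020 device aspects; below is a minimal standalone C++ sketch of the underlying query, illustrative rather than dpctl code.

```cpp
#include <iostream>
#include <sycl/sycl.hpp>

int main()
{
    sycl::queue q;                     // default-selected device, as in the tests
    sycl::device d = q.get_device();

    // dpctl's has_aspect_fp64 / has_aspect_fp16 wrap checks like these.
    bool has_fp64 = d.has(sycl::aspect::fp64);
    bool has_fp16 = d.has(sycl::aspect::fp16);

    std::cout << "fp64: " << has_fp64 << ", fp16: " << has_fp16 << "\n";
    // A test harness would skip double/half-precision cases when the
    // corresponding aspect is missing, mirroring pytest.skip above.
    return 0;
}
```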