Simplify copy and cast kernels #1165

Merged 5 commits on Apr 11, 2023
255 changes: 202 additions & 53 deletions dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
@@ -46,51 +46,52 @@ using namespace dpctl::tensor::offset_utils;

template <typename srcT, typename dstT, typename IndexerT>
class copy_cast_generic_kernel;

template <typename srcT,
typename dstT,
unsigned int vec_sz,
unsigned int n_vecs>
class copy_cast_contig_kernel;

template <typename srcT, typename dstT, typename IndexerT>
class copy_cast_from_host_kernel;
// template <typename srcT, typename dstT, typename IndexerT>
// class copy_cast_spec_kernel;

template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
class copy_for_reshape_generic_kernel;

template <typename srcT, typename dstT> class Caster
template <typename srcTy, typename dstTy> class Caster
{
public:
Caster() = default;
void operator()(const char *src,
std::ptrdiff_t src_offset,
char *dst,
std::ptrdiff_t dst_offset) const
dstTy operator()(const srcTy &src) const
{
using dpctl::tensor::type_utils::convert_impl;

const srcT *src_ = reinterpret_cast<const srcT *>(src) + src_offset;
dstT *dst_ = reinterpret_cast<dstT *>(dst) + dst_offset;
*dst_ = convert_impl<dstT, srcT>(*src_);
return convert_impl<dstTy, srcTy>(src);
}
};

template <typename CastFnT, typename IndexerT> class GenericCopyFunctor
template <typename srcT, typename dstT, typename CastFnT, typename IndexerT>
class GenericCopyFunctor
{
private:
const char *src_ = nullptr;
char *dst_ = nullptr;
const srcT *src_ = nullptr;
dstT *dst_ = nullptr;
IndexerT indexer_;

public:
GenericCopyFunctor(const char *src_cp, char *dst_cp, IndexerT indexer)
: src_(src_cp), dst_(dst_cp), indexer_(indexer)
GenericCopyFunctor(const srcT *src_p, dstT *dst_p, IndexerT indexer)
: src_(src_p), dst_(dst_p), indexer_(indexer)
{
}

void operator()(sycl::id<1> wiid) const
{
auto offsets = indexer_(static_cast<py::ssize_t>(wiid.get(0)));
py::ssize_t src_offset = offsets.get_first_offset();
py::ssize_t dst_offset = offsets.get_second_offset();
const auto &offsets = indexer_(static_cast<py::ssize_t>(wiid.get(0)));
const py::ssize_t &src_offset = offsets.get_first_offset();
const py::ssize_t &dst_offset = offsets.get_second_offset();

CastFnT fn{};
fn(src_, src_offset, dst_, dst_offset);
dst_[dst_offset] = fn(src_[src_offset]);
}
};
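
As a minimal illustration of the refactored pattern above, in which the caster maps a source value to a destination value and the functor works with typed pointers rather than char pointers plus byte offsets, the standalone sketch below runs the same per-item body serially on the host. IdentityIndexerSketch and CasterSketch are hypothetical stand-ins for TwoOffsets_StridedIndexer and Caster, not part of this header.

#include <cstddef>
#include <vector>

// Hypothetical stand-in for TwoOffsets_StridedIndexer: maps a flat index to
// identical source and destination offsets.
struct IdentityIndexerSketch
{
    struct Offsets
    {
        std::ptrdiff_t first;
        std::ptrdiff_t second;
        std::ptrdiff_t get_first_offset() const { return first; }
        std::ptrdiff_t get_second_offset() const { return second; }
    };
    Offsets operator()(std::ptrdiff_t i) const { return {i, i}; }
};

// Hypothetical stand-in for Caster<srcTy, dstTy>.
template <typename srcTy, typename dstTy> struct CasterSketch
{
    dstTy operator()(const srcTy &v) const { return static_cast<dstTy>(v); }
};

int main()
{
    std::vector<float> src{1.5f, 2.5f, 3.5f};
    std::vector<int> dst(src.size());

    IdentityIndexerSketch indexer{};
    CasterSketch<float, int> fn{};

    // Same per-work-item body as GenericCopyFunctor::operator(), run serially.
    for (std::ptrdiff_t i = 0; i < static_cast<std::ptrdiff_t>(src.size());
         ++i)
    {
        const auto &offsets = indexer(i);
        dst[offsets.get_second_offset()] = fn(src[offsets.get_first_offset()]);
    }
    return 0;
}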

@@ -168,12 +169,15 @@ copy_and_cast_generic_impl(sycl::queue q,

TwoOffsets_StridedIndexer indexer{nd, src_offset, dst_offset,
shape_and_strides};
const srcTy *src_tp = reinterpret_cast<const srcTy *>(src_p);
dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p);

cgh.parallel_for<class copy_cast_generic_kernel<
srcTy, dstTy, TwoOffsets_StridedIndexer>>(
sycl::range<1>(nelems),
GenericCopyFunctor<Caster<srcTy, dstTy>, TwoOffsets_StridedIndexer>(
src_p, dst_p, indexer));
GenericCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>,
TwoOffsets_StridedIndexer>(src_tp, dst_tp,
indexer));
});

return copy_and_cast_ev;
@@ -193,6 +197,160 @@ template <typename fnT, typename D, typename S> struct CopyAndCastGenericFactory
}
};

// Specialization of copy_and_cast for contiguous arrays

template <typename srcT,
typename dstT,
typename CastFnT,
int vec_sz = 4,
int n_vecs = 2>
class ContigCopyFunctor
{
private:
const size_t nelems;
const srcT *src_p = nullptr;
dstT *dst_p = nullptr;

public:
ContigCopyFunctor(const size_t nelems_, const srcT *src_p_, dstT *dst_p_)
: nelems(nelems_), src_p(src_p_), dst_p(dst_p_)
{
}

void operator()(sycl::nd_item<1> ndit) const
{
CastFnT fn{};

using dpctl::tensor::type_utils::is_complex;
if constexpr (is_complex<srcT>::value || is_complex<dstT>::value) {
std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
size_t base = ndit.get_global_linear_id();

base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize);
for (size_t offset = base;
offset < std::min(nelems, base + sgSize * (n_vecs * vec_sz));
offset += sgSize)
{
dst_p[offset] = fn(src_p[offset]);
}
}
else {
auto sg = ndit.get_sub_group();
std::uint8_t sgSize = sg.get_local_range()[0];
std::uint8_t max_sgSize = sg.get_max_local_range()[0];
size_t base = n_vecs * vec_sz *
(ndit.get_group(0) * ndit.get_local_range(0) +
sg.get_group_id()[0] * max_sgSize);

if (base + n_vecs * vec_sz * sgSize < nelems &&
sgSize == max_sgSize) {
using src_ptrT =
sycl::multi_ptr<const srcT,
sycl::access::address_space::global_space>;
using dst_ptrT =
sycl::multi_ptr<dstT,
sycl::access::address_space::global_space>;
sycl::vec<srcT, vec_sz> src_vec;
sycl::vec<dstT, vec_sz> dst_vec;

#pragma unroll
for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
src_vec =
sg.load<vec_sz>(src_ptrT(&src_p[base + it * sgSize]));
#pragma unroll
for (std::uint8_t k = 0; k < vec_sz; k++) {
dst_vec[k] = fn(src_vec[k]);
}
sg.store<vec_sz>(dst_ptrT(&dst_p[base + it * sgSize]),
dst_vec);
}
}
else {
for (size_t k = base + sg.get_local_id()[0]; k < nelems;
k += sgSize) {
dst_p[k] = fn(src_p[k]);
}
}
}
}
};
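
A rough sketch of how the vectorized branch of ContigCopyFunctor tiles the flat index space: each sub-group owns a contiguous block of n_vecs * vec_sz * sgSize elements starting at base, and each sg.load/sg.store of vec_sz * sgSize elements hands lane l the strided indices base + (it + k) * sgSize + l for k in [0, vec_sz). The standalone snippet below only enumerates those indices for one lane; the sub-group size, lane, and group numbers are made-up example values, and sgSize stands in for max_sgSize.

#include <cstddef>
#include <cstdio>

int main()
{
    constexpr std::size_t vec_sz = 4;
    constexpr std::size_t n_vecs = 2;
    constexpr std::size_t sgSize = 16;  // illustrative sub-group size
    constexpr std::size_t lane = 3;     // one work-item within the sub-group

    // Mirrors: base = n_vecs * vec_sz *
    //     (group * local_range + sg_group_id * max_sgSize)
    constexpr std::size_t group = 0;
    constexpr std::size_t local_range = 64;
    constexpr std::size_t sg_group_id = 1;
    const std::size_t base =
        n_vecs * vec_sz * (group * local_range + sg_group_id * sgSize);

    // Element indices this lane reads and writes across the n_vecs block
    // loads/stores of the vectorized branch.
    for (std::size_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
        for (std::size_t k = 0; k < vec_sz; ++k) {
            std::printf("lane %zu handles element %zu\n", lane,
                        base + (it + k) * sgSize + lane);
        }
    }
    return 0;
}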

/*!
* @brief Function pointer type for contiguous array cast and copy function.
*/
typedef sycl::event (*copy_and_cast_contig_fn_ptr_t)(
sycl::queue,
size_t,
const char *,
char *,
const std::vector<sycl::event> &);

/*!
* @brief Function to copy `nelems` elements from contiguous `src` usm_ndarray
to contiguous `dst` usm_ndarray while casting from `srcTy` to `dstTy`.

Both arrays have the same number of elements `nelems`.
`src_cp` and `dst_cp` are char pointers to the start of the respective
arrays. The kernel is submitted to SYCL queue `q` with the events in
`depends` as dependencies.

@param q Sycl queue to which the kernel is submitted.
@param nelems Number of elements to cast and copy.
@param src_cp Kernel accessible USM pointer for the source array
@param dst_cp Kernel accessible USM pointer for the destination array
@param depends List of events to wait for before starting computations, if
any.

@return Event to wait on to ensure that computation completes.
@ingroup CopyAndCastKernels
*/
template <typename dstTy, typename srcTy>
sycl::event copy_and_cast_contig_impl(sycl::queue q,
size_t nelems,
const char *src_cp,
char *dst_cp,
const std::vector<sycl::event> &depends)
{
dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
dpctl::tensor::type_utils::validate_type_for_device<srcTy>(q);

sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);

const srcTy *src_tp = reinterpret_cast<const srcTy *>(src_cp);
dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_cp);

size_t lws = 64;
constexpr unsigned int vec_sz = 4;
constexpr unsigned int n_vecs = 2;
const size_t n_groups =
((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
const auto gws_range = sycl::range<1>(n_groups * lws);
const auto lws_range = sycl::range<1>(lws);

cgh.parallel_for<copy_cast_contig_kernel<srcTy, dstTy, n_vecs, vec_sz>>(
sycl::nd_range<1>(gws_range, lws_range),
ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
n_vecs>(nelems, src_tp, dst_tp));
});

return copy_and_cast_ev;
}
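
For concreteness, the launch configuration above gives each work-item n_vecs * vec_sz = 8 elements, so one work-group of lws = 64 work-items covers 512 elements and n_groups is the ceiling of nelems / 512. A small sketch of that arithmetic, with an arbitrary example value for nelems:

#include <cstddef>
#include <cstdio>

int main()
{
    constexpr std::size_t lws = 64;
    constexpr std::size_t vec_sz = 4;
    constexpr std::size_t n_vecs = 2;
    const std::size_t nelems = 1000;  // arbitrary example size

    const std::size_t elems_per_group = lws * n_vecs * vec_sz;  // 512
    const std::size_t n_groups =
        (nelems + elems_per_group - 1) / elems_per_group;       // ceil -> 2
    std::printf("global range = %zu, local range = %zu\n", n_groups * lws,
                lws);                                            // 128, 64
    return 0;
}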

/*!
* @brief Factory to get specialized function pointer for casting and copying
* contiguous arrays.
* @ingroup CopyAndCastKernels
*/
template <typename fnT, typename D, typename S> struct CopyAndCastContigFactory
{
fnT get()
{
fnT f = copy_and_cast_contig_impl<D, S>;
return f;
}
};
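
The factory above presumably feeds a per-(destination, source) dispatch table of copy_and_cast_contig_fn_ptr_t entries elsewhere in the library; the table-building machinery is not part of this diff. Below is a hypothetical, hand-filled two-type table, assuming the header above is included and using an illustrative ordering of float then std::int32_t:

#include <cstdint>

// Hypothetical 2x2 table indexed as [dst type id][src type id]; real code
// would build the full num_types x num_types table with its own utilities.
static copy_and_cast_contig_fn_ptr_t
    copy_and_cast_contig_dispatch_table_sketch[2][2];

void init_copy_and_cast_contig_dispatch_table_sketch()
{
    // dst = float, src = std::int32_t
    copy_and_cast_contig_dispatch_table_sketch[0][1] =
        CopyAndCastContigFactory<copy_and_cast_contig_fn_ptr_t, float,
                                 std::int32_t>{}
            .get();
    // dst = std::int32_t, src = float
    copy_and_cast_contig_dispatch_table_sketch[1][0] =
        CopyAndCastContigFactory<copy_and_cast_contig_fn_ptr_t, std::int32_t,
                                 float>{}
            .get();
}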

// Specialization of copy_and_cast for 1D arrays

/*!
@@ -276,13 +434,15 @@ copy_and_cast_nd_specialized_impl(sycl::queue q,
using IndexerT = TwoOffsets_FixedDimStridedIndexer<nd>;
IndexerT indexer{shape, src_strides, dst_strides, src_offset,
dst_offset};
const srcTy *src_tp = reinterpret_cast<const srcTy *>(src_p);
dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p);

cgh.depends_on(depends);
cgh.parallel_for<
class copy_cast_generic_kernel<srcTy, dstTy, IndexerT>>(
sycl::range<1>(nelems),
GenericCopyFunctor<Caster<srcTy, dstTy>, IndexerT>(src_p, dst_p,
indexer));
GenericCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, IndexerT>(
src_tp, dst_tp, indexer));
});

return copy_and_cast_ev;
@@ -318,46 +478,33 @@ template <typename fnT, typename D, typename S> struct CopyAndCast2DFactory

// ====================== Copying from host to USM

template <typename srcT, typename dstT, typename AccessorT>
class CasterForAccessor
{
public:
CasterForAccessor() = default;
void operator()(AccessorT src,
std::ptrdiff_t src_offset,
char *dst,
std::ptrdiff_t dst_offset) const
{
using dpctl::tensor::type_utils::convert_impl;

dstT *dst_ = reinterpret_cast<dstT *>(dst) + dst_offset;
*dst_ = convert_impl<dstT, srcT>(src[src_offset]);
}
};

template <typename CastFnT, typename AccessorT, typename IndexerT>
template <typename AccessorT,
typename dstTy,
typename CastFnT,
typename IndexerT>
class GenericCopyFromHostFunctor
{
private:
AccessorT src_acc_;
char *dst_ = nullptr;
dstTy *dst_ = nullptr;
IndexerT indexer_;

public:
GenericCopyFromHostFunctor(AccessorT src_acc,
char *dst_cp,
dstTy *dst_p,
IndexerT indexer)
: src_acc_(src_acc), dst_(dst_cp), indexer_(indexer)
: src_acc_(src_acc), dst_(dst_p), indexer_(indexer)
{
}

void operator()(sycl::id<1> wiid) const
{
auto offsets = indexer_(static_cast<py::ssize_t>(wiid.get(0)));
py::ssize_t src_offset = offsets.get_first_offset();
py::ssize_t dst_offset = offsets.get_second_offset();
const auto &offsets = indexer_(static_cast<py::ssize_t>(wiid.get(0)));
const py::ssize_t &src_offset = offsets.get_first_offset();
const py::ssize_t &dst_offset = offsets.get_second_offset();

CastFnT fn{};
fn(src_acc_, src_offset, dst_, dst_offset);
dst_[dst_offset] = fn(src_acc_[src_offset]);
}
};
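
The host-to-USM path now reuses the same Caster; only the source argument changes from a raw device pointer to an accessor, which is why the functor is templated on AccessorT. A plain host-side sketch with a toy accessor standing in for the sycl::accessor (all names below are illustrative):

#include <cstddef>
#include <vector>

// Toy stand-in for an accessor over host data: anything indexable works.
template <typename T> struct ToyAccessorSketch
{
    const T *data;
    const T &operator[](std::ptrdiff_t i) const { return data[i]; }
};

template <typename srcTy, typename dstTy> struct HostCasterSketch
{
    dstTy operator()(const srcTy &v) const { return static_cast<dstTy>(v); }
};

int main()
{
    std::vector<double> host_src{0.25, 0.5, 0.75};
    std::vector<float> dst(host_src.size());

    ToyAccessorSketch<double> acc{host_src.data()};
    HostCasterSketch<double, float> fn{};

    // Same per-work-item body as GenericCopyFromHostFunctor::operator(),
    // with trivial identity indexing, run serially.
    for (std::ptrdiff_t i = 0; i < static_cast<std::ptrdiff_t>(dst.size());
         ++i)
    {
        dst[i] = fn(acc[i]);
    }
    return 0;
}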

@@ -447,13 +594,15 @@ void copy_and_cast_from_host_impl(
nd, src_offset - src_min_nelem_offset, dst_offset,
const_cast<const py::ssize_t *>(shape_and_strides)};

dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p);

cgh.parallel_for<copy_cast_from_host_kernel<srcTy, dstTy,
TwoOffsets_StridedIndexer>>(
sycl::range<1>(nelems),
GenericCopyFromHostFunctor<
CasterForAccessor<srcTy, dstTy, decltype(npy_acc)>,
decltype(npy_acc), TwoOffsets_StridedIndexer>(npy_acc, dst_p,
indexer));
GenericCopyFromHostFunctor<decltype(npy_acc), dstTy,
Caster<srcTy, dstTy>,
TwoOffsets_StridedIndexer>(
npy_acc, dst_tp, indexer));
});

// perform explicit synchronization. Implicit synchronization would be