Improvement to performance of tensor.sum #1303

Merged 7 commits on Aug 18, 2023.
131 changes: 72 additions & 59 deletions dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp
@@ -55,15 +55,12 @@ template <typename T> struct boolean_predicate
     }
 };
 
-template <typename inpT,
-          typename outT,
-          typename PredicateT,
-          std::uint8_t wg_dim = 2>
+template <typename inpT, typename outT, typename PredicateT>
 struct all_reduce_wg_contig
 {
-    void operator()(sycl::nd_item<wg_dim> &ndit,
+    void operator()(sycl::nd_item<1> &ndit,
                     outT *out,
-                    size_t &out_idx,
+                    const size_t &out_idx,
                     const inpT *start,
                     const inpT *end) const
     {
@@ -82,15 +79,12 @@ struct all_reduce_wg_contig
     }
 };
 
-template <typename inpT,
-          typename outT,
-          typename PredicateT,
-          std::uint8_t wg_dim = 2>
+template <typename inpT, typename outT, typename PredicateT>
 struct any_reduce_wg_contig
 {
-    void operator()(sycl::nd_item<wg_dim> &ndit,
+    void operator()(sycl::nd_item<1> &ndit,
                     outT *out,
-                    size_t &out_idx,
+                    const size_t &out_idx,
                     const inpT *start,
                     const inpT *end) const
     {
@@ -109,9 +103,9 @@ struct any_reduce_wg_contig
     }
 };
 
-template <typename T, std::uint8_t wg_dim = 2> struct all_reduce_wg_strided
+template <typename T> struct all_reduce_wg_strided
 {
-    void operator()(sycl::nd_item<wg_dim> &ndit,
+    void operator()(sycl::nd_item<1> &ndit,
                     T *out,
                     const size_t &out_idx,
                     const T &local_val) const
@@ -129,9 +123,9 @@ template <typename T, std::uint8_t wg_dim = 2> struct all_reduce_wg_strided
     }
 };
 
-template <typename T, std::uint8_t wg_dim = 2> struct any_reduce_wg_strided
+template <typename T> struct any_reduce_wg_strided
 {
-    void operator()(sycl::nd_item<wg_dim> &ndit,
+    void operator()(sycl::nd_item<1> &ndit,
                     T *out,
                     const size_t &out_idx,
                     const T &local_val) const
@@ -215,35 +209,46 @@ struct ContigBooleanReduction
     outT *out_ = nullptr;
     GroupOp group_op_;
     size_t reduction_max_gid_ = 0;
+    size_t iter_gws_ = 1;
     size_t reductions_per_wi = 16;
 
 public:
     ContigBooleanReduction(const argT *inp,
                            outT *res,
                            GroupOp group_op,
                            size_t reduction_size,
+                           size_t iteration_size,
                            size_t reduction_size_per_wi)
         : inp_(inp), out_(res), group_op_(group_op),
-          reduction_max_gid_(reduction_size),
+          reduction_max_gid_(reduction_size), iter_gws_(iteration_size),
           reductions_per_wi(reduction_size_per_wi)
     {
     }
 
-    void operator()(sycl::nd_item<2> it) const
+    void operator()(sycl::nd_item<1> it) const
     {
 
-        size_t reduction_id = it.get_group(0);
-        size_t reduction_batch_id = it.get_group(1);
-        size_t wg_size = it.get_local_range(1);
-
-        size_t base = reduction_id * reduction_max_gid_;
-        size_t start = base + reduction_batch_id * wg_size * reductions_per_wi;
-        size_t end = std::min((start + (reductions_per_wi * wg_size)),
-                              base + reduction_max_gid_);
+        const size_t red_gws_ = it.get_global_range(0) / iter_gws_;
+        const size_t reduction_id = it.get_global_id(0) / red_gws_;
+        const size_t reduction_batch_id = get_reduction_batch_id(it);
+        const size_t wg_size = it.get_local_range(0);
+
+        const size_t base = reduction_id * reduction_max_gid_;
+        const size_t start =
+            base + reduction_batch_id * wg_size * reductions_per_wi;
+        const size_t end = std::min((start + (reductions_per_wi * wg_size)),
+                                    base + reduction_max_gid_);
         // reduction and atomic operations are performed
         // in group_op_
         group_op_(it, out_, reduction_id, inp_ + start, inp_ + end);
     }
+
+private:
+    size_t get_reduction_batch_id(sycl::nd_item<1> const &it) const
+    {
+        const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_;
+        const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups;
+        return reduction_batch_id;
+    }
 };
 
 typedef sycl::event (*boolean_reduction_contig_impl_fn_ptr)(
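The flattened 1-D launch recovers both former axes arithmetically: dividing the global id by the per-reduction span gives reduction_id, and taking the group id modulo the number of groups per reduction gives reduction_batch_id. Below is a standalone host-side sketch of that arithmetic, not code from this PR; iter_nelems, reduction_groups, and wg are hypothetical stand-ins for the values the submitting code computes.

    // Host-side check of the 1-D index decomposition used above
    // (hypothetical driver, not part of this PR).
    #include <cassert>
    #include <cstddef>

    int main()
    {
        const std::size_t iter_nelems = 3;      // outputs (former axis 0)
        const std::size_t reduction_groups = 4; // work-groups per reduction
        const std::size_t wg = 8;               // work-group size

        const std::size_t global_range = iter_nelems * reduction_groups * wg;
        const std::size_t red_gws = global_range / iter_nelems; // items per reduction

        for (std::size_t gid = 0; gid < global_range; ++gid) {
            const std::size_t group_id = gid / wg;                    // it.get_group(0)
            const std::size_t reduction_id = gid / red_gws;           // former axis 0
            const std::size_t batch_id = group_id % reduction_groups; // former axis 1
            // the two decompositions agree: consecutive blocks of
            // reduction_groups work-groups all serve the same output
            assert(reduction_id == group_id / reduction_groups);
            assert(batch_id < reduction_groups);
        }
        return 0;
    }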
@@ -332,7 +337,7 @@ boolean_reduction_contig_impl(sycl::queue exec_q,
         red_ev = exec_q.submit([&](sycl::handler &cgh) {
             cgh.depends_on(init_ev);
 
-            constexpr std::uint8_t group_dim = 2;
+            constexpr std::uint8_t dim = 1;
 
             constexpr size_t preferred_reductions_per_wi = 4;
             size_t reductions_per_wi =
@@ -344,15 +349,14 @@ boolean_reduction_contig_impl(sycl::queue exec_q,
                 (reduction_nelems + reductions_per_wi * wg - 1) /
                 (reductions_per_wi * wg);
 
-            auto gws =
-                sycl::range<group_dim>{iter_nelems, reduction_groups * wg};
-            auto lws = sycl::range<group_dim>{1, wg};
+            auto gws = sycl::range<dim>{iter_nelems * reduction_groups * wg};
+            auto lws = sycl::range<dim>{wg};
 
             cgh.parallel_for<
                 class boolean_reduction_contig_krn<argTy, resTy, GroupOpT>>(
-                sycl::nd_range<group_dim>(gws, lws),
+                sycl::nd_range<dim>(gws, lws),
                 ContigBooleanReduction<argTy, resTy, GroupOpT>(
-                    arg_tp, res_tp, GroupOpT(), reduction_nelems,
+                    arg_tp, res_tp, GroupOpT(), reduction_nelems, iter_nelems,
                     reductions_per_wi));
         });
     }
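The old and new launches enumerate the same total number of work-items; only the layout changes. For illustration, a sketch with hypothetical constants standing in for the quantities computed in the submission above:

    #include <cstddef>
    #include <sycl/sycl.hpp>

    int main()
    {
        // Hypothetical stand-ins, not the values this PR computes.
        constexpr std::size_t iter_nelems = 3, reduction_groups = 4, wg = 8;

        // Before: 2-D range with work-group shape {1, wg}; axis 0 indexes outputs.
        const sycl::nd_range<2> ndr_2d{
            sycl::range<2>{iter_nelems, reduction_groups * wg},
            sycl::range<2>{1, wg}};

        // After: the same work-items flattened onto a single densely packed axis.
        const sycl::nd_range<1> ndr_1d{
            sycl::range<1>{iter_nelems * reduction_groups * wg},
            sycl::range<1>{wg}};

        (void)ndr_2d;
        (void)ndr_1d;
        return 0;
    }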
@@ -404,6 +408,7 @@ struct StridedBooleanReduction
     InputOutputIterIndexerT inp_out_iter_indexer_;
     InputRedIndexerT inp_reduced_dims_indexer_;
     size_t reduction_max_gid_ = 0;
+    size_t iter_gws_ = 1;
     size_t reductions_per_wi = 16;
 
 public:
@@ -415,23 +420,24 @@ struct StridedBooleanReduction
                             InputOutputIterIndexerT arg_res_iter_indexer,
                             InputRedIndexerT arg_reduced_dims_indexer,
                             size_t reduction_size,
+                            size_t iteration_size,
                             size_t reduction_size_per_wi)
         : inp_(inp), out_(res), reduction_op_(reduction_op),
           group_op_(group_op), identity_(identity_val),
           inp_out_iter_indexer_(arg_res_iter_indexer),
           inp_reduced_dims_indexer_(arg_reduced_dims_indexer),
-          reduction_max_gid_(reduction_size),
+          reduction_max_gid_(reduction_size), iter_gws_(iteration_size),
           reductions_per_wi(reduction_size_per_wi)
     {
     }
 
-    void operator()(sycl::nd_item<2> it) const
+    void operator()(sycl::nd_item<1> it) const
     {
 
-        size_t reduction_id = it.get_group(0);
-        size_t reduction_batch_id = it.get_group(1);
-        size_t reduction_lid = it.get_local_id(1);
-        size_t wg_size = it.get_local_range(1);
+        const size_t red_gws_ = it.get_global_range(0) / iter_gws_;
+        const size_t reduction_id = it.get_global_id(0) / red_gws_;
+        const size_t reduction_batch_id = get_reduction_batch_id(it);
+        const size_t reduction_lid = it.get_local_id(0);
+        const size_t wg_size = it.get_local_range(0);
 
         auto inp_out_iter_offsets_ = inp_out_iter_indexer_(reduction_id);
         const py::ssize_t &inp_iter_offset =
@@ -442,26 +448,34 @@ struct StridedBooleanReduction
         outT local_red_val(identity_);
         size_t arg_reduce_gid0 =
             reduction_lid + reduction_batch_id * wg_size * reductions_per_wi;
-        for (size_t m = 0; m < reductions_per_wi; ++m) {
-            size_t arg_reduce_gid = arg_reduce_gid0 + m * wg_size;
-
-            if (arg_reduce_gid < reduction_max_gid_) {
-                py::ssize_t inp_reduction_offset = static_cast<py::ssize_t>(
-                    inp_reduced_dims_indexer_(arg_reduce_gid));
-                py::ssize_t inp_offset = inp_iter_offset + inp_reduction_offset;
+        size_t arg_reduce_gid_max = std::min(
+            reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg_size);
+        for (size_t arg_reduce_gid = arg_reduce_gid0;
+             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg_size)
+        {
+            py::ssize_t inp_reduction_offset = static_cast<py::ssize_t>(
+                inp_reduced_dims_indexer_(arg_reduce_gid));
+            py::ssize_t inp_offset = inp_iter_offset + inp_reduction_offset;
 
-                // must convert to boolean first to handle nans
-                using dpctl::tensor::type_utils::convert_impl;
-                bool val = convert_impl<bool, argT>(inp_[inp_offset]);
-                ReductionOp op = reduction_op_;
+            // must convert to boolean first to handle nans
+            using dpctl::tensor::type_utils::convert_impl;
+            bool val = convert_impl<bool, argT>(inp_[inp_offset]);
+            ReductionOp op = reduction_op_;
 
-                local_red_val = op(local_red_val, static_cast<outT>(val));
-            }
+            local_red_val = op(local_red_val, static_cast<outT>(val));
         }
         // reduction and atomic operations are performed
         // in group_op_
         group_op_(it, out_, out_iter_offset, local_red_val);
     }
+
+private:
+    size_t get_reduction_batch_id(sycl::nd_item<1> const &it) const
+    {
+        const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_;
+        const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups;
+        return reduction_batch_id;
+    }
 };
 
 template <typename T1,
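Besides the 1-D launch, the hunk above restructures the inner loop: the iteration bound is clamped once with std::min, so the per-iteration bounds check disappears and the loop body is branch-free. A minimal scalar analogue of the transformation (hypothetical helper functions, shown only to contrast the two loop shapes):

    #include <algorithm>
    #include <cstddef>

    // Before: fixed trip count, bounds check inside the body.
    int reduce_guarded(const int *inp, std::size_t gid0, std::size_t n,
                       std::size_t per_wi, std::size_t wg)
    {
        int acc = 0;
        for (std::size_t m = 0; m < per_wi; ++m) {
            std::size_t gid = gid0 + m * wg;
            if (gid < n) { // checked on every iteration
                acc += inp[gid];
            }
        }
        return acc;
    }

    // After: the bound is hoisted, the body carries no branch.
    int reduce_clamped(const int *inp, std::size_t gid0, std::size_t n,
                       std::size_t per_wi, std::size_t wg)
    {
        int acc = 0;
        const std::size_t gid_max = std::min(n, gid0 + per_wi * wg);
        for (std::size_t gid = gid0; gid < gid_max; gid += wg) {
            acc += inp[gid];
        }
        return acc;
    }

For any wg > 0 the two functions visit exactly the same indices, so the rewrite changes the work-item's control flow but not its result.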
@@ -564,7 +578,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q,
         red_ev = exec_q.submit([&](sycl::handler &cgh) {
             cgh.depends_on(res_init_ev);
 
-            constexpr std::uint8_t group_dim = 2;
+            constexpr std::uint8_t dim = 1;
 
             using InputOutputIterIndexerT =
                 dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
@@ -587,20 +601,19 @@ boolean_reduction_strided_impl(sycl::queue exec_q,
                 (reduction_nelems + reductions_per_wi * wg - 1) /
                 (reductions_per_wi * wg);
 
-            auto gws =
-                sycl::range<group_dim>{iter_nelems, reduction_groups * wg};
-            auto lws = sycl::range<group_dim>{1, wg};
+            auto gws = sycl::range<dim>{iter_nelems * reduction_groups * wg};
+            auto lws = sycl::range<dim>{wg};
 
             cgh.parallel_for<class boolean_reduction_strided_krn<
                 argTy, resTy, RedOpT, GroupOpT, InputOutputIterIndexerT,
                 ReductionIndexerT>>(
-                sycl::nd_range<group_dim>(gws, lws),
+                sycl::nd_range<dim>(gws, lws),
                 StridedBooleanReduction<argTy, resTy, RedOpT, GroupOpT,
                                         InputOutputIterIndexerT,
                                         ReductionIndexerT>(
                     arg_tp, res_tp, RedOpT(), GroupOpT(), identity_val,
                     in_out_iter_indexer, reduction_indexer, reduction_nelems,
-                    reductions_per_wi));
+                    iter_nelems, reductions_per_wi));
         });
     }
     return red_ev;