Skip to content

Feature/multiply and subtract #1211

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions dpctl/tensor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,9 @@
isfinite,
isinf,
isnan,
multiply,
sqrt,
subtract,
)

__all__ = [
Expand Down Expand Up @@ -186,5 +188,7 @@
"isfinite",
"sqrt",
"divide",
"multiply",
"subtract",
"equal",
]
57 changes: 51 additions & 6 deletions dpctl/tensor/_elementwise_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
# B01: ===== ADD (x1, x2)

_add_docstring_ = """
add(x1, x2, order='K')
add(x1, x2, out=None, order='K')

Calculates the sum for each element `x1_i` of the input array `x1` with
the respective element `x2_i` of the input array `x2`.
Expand Down Expand Up @@ -94,7 +94,7 @@

# U11: ==== COS (x)
_cos_docstring = """
cos(x, order='K')
cos(x, out=None, order='K')

Computes cosine for each element `x_i` for input array `x`.
"""
Expand All @@ -106,7 +106,7 @@

# B08: ==== DIVIDE (x1, x2)
_divide_docstring_ = """
divide(x1, x2, order='K')
divide(x1, x2, out=None, order='K')

Calculates the ratio for each element `x1_i` of the input array `x1` with
the respective element `x2_i` of the input array `x2`.
Expand All @@ -128,7 +128,7 @@

# B09: ==== EQUAL (x1, x2)
_equal_docstring_ = """
equal(x1, x2, order='K')
equal(x1, x2, out=None, order='K')

Calculates equality test results for each element `x1_i` of the input array `x1`
with the respective element `x2_i` of the input array `x2`.
Expand Down Expand Up @@ -172,6 +172,8 @@

# U17: ==== ISFINITE (x)
_isfinite_docstring_ = """
isfinite(x, out=None, order='K')

Computes if every element of input array is a finite number.
"""

Expand All @@ -181,6 +183,8 @@

# U18: ==== ISINF (x)
_isinf_docstring_ = """
isinf(x, out=None, order='K')

Computes if every element of input array is an infinity.
"""

Expand All @@ -190,6 +194,8 @@

# U19: ==== ISNAN (x)
_isnan_docstring_ = """
isnan(x, out=None, order='K')

Computes if every element of input array is a NaN.
"""

Expand Down Expand Up @@ -231,7 +237,25 @@
# FIXME: implement B18

# B19: ==== MULTIPLY (x1, x2)
# FIXME: implement B19
# Public docstring for dpctl.tensor.multiply; rendered as the __doc__ of the
# BinaryElementwiseFunc instance below.
_multiply_docstring_ = """
multiply(x1, x2, out=None, order='K')

Calculates the product for each element `x1_i` of the input array `x1`
with the respective element `x2_i` of the input array `x2`.

Args:
    x1 (usm_ndarray):
        First input array, expected to have numeric data type.
    x2 (usm_ndarray):
        Second input array, also expected to have numeric data type.
Returns:
    usm_ndarray:
        an array containing the element-wise products. The data type of
        the returned array is determined by the Type Promotion Rules.
"""
# Binds the compiled type-resolution and kernel entry points
# (ti._multiply_result_type / ti._multiply) to the Python-level callable.
multiply = BinaryElementwiseFunc(
    "multiply", ti._multiply_result_type, ti._multiply, _multiply_docstring_
)

# U25: ==== NEGATIVE (x)
# FIXME: implement U25
Expand Down Expand Up @@ -268,6 +292,8 @@

# U33: ==== SQRT (x)
_sqrt_docstring_ = """
sqrt(x, out=None, order='K')

Computes sqrt for each element `x_i` for input array `x`.
"""

Expand All @@ -276,7 +302,26 @@
)

# B23: ==== SUBTRACT (x1, x2)
# FIXME: implement B23
# Public docstring for dpctl.tensor.subtract; rendered as the __doc__ of the
# BinaryElementwiseFunc instance below.
_subtract_docstring_ = """
subtract(x1, x2, out=None, order='K')

Calculates the difference between each element `x1_i` of the input
array `x1` and the respective element `x2_i` of the input array `x2`.

Args:
    x1 (usm_ndarray):
        First input array, expected to have numeric data type.
    x2 (usm_ndarray):
        Second input array, also expected to have numeric data type.
Returns:
    usm_ndarray:
        an array containing the element-wise differences. The data type
        of the returned array is determined by the Type Promotion Rules.
"""
# Binds the compiled type-resolution and kernel entry points
# (ti._subtract_result_type / ti._subtract) to the Python-level callable.
subtract = BinaryElementwiseFunc(
    "subtract", ti._subtract_result_type, ti._subtract, _subtract_docstring_
)


# U34: ==== TAN (x)
# FIXME: implement U34
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,27 +115,9 @@ sycl::event abs_contig_impl(sycl::queue exec_q,
char *res_p,
const std::vector<sycl::event> &depends = {})
{
sycl::event abs_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);

size_t lws = 64;
constexpr unsigned int vec_sz = 4;
constexpr unsigned int n_vecs = 2;
const size_t n_groups =
((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
const auto gws_range = sycl::range<1>(n_groups * lws);
const auto lws_range = sycl::range<1>(lws);

using resTy = typename AbsOutputType<argTy>::value_type;
const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_p);
resTy *res_tp = reinterpret_cast<resTy *>(res_p);

cgh.parallel_for<abs_contig_kernel<argTy, resTy, vec_sz, n_vecs>>(
sycl::nd_range<1>(gws_range, lws_range),
AbsContigFunctor<argTy, resTy, vec_sz, n_vecs>(arg_tp, res_tp,
nelems));
});
return abs_ev;
return elementwise_common::unary_contig_impl<
argTy, AbsOutputType, AbsContigFunctor, abs_contig_kernel>(
exec_q, nelems, arg_p, res_p, depends);
}

template <typename fnT, typename T> struct AbsContigFactory
Expand Down Expand Up @@ -182,24 +164,10 @@ sycl::event abs_strided_impl(sycl::queue exec_q,
const std::vector<sycl::event> &depends,
const std::vector<sycl::event> &additional_depends)
{
sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);
cgh.depends_on(additional_depends);

using resTy = typename AbsOutputType<argTy>::value_type;
using IndexerT =
typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;

IndexerT indexer{nd, arg_offset, res_offset, shape_and_strides};

const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_p);
resTy *res_tp = reinterpret_cast<resTy *>(res_p);

cgh.parallel_for<abs_strided_kernel<argTy, resTy, IndexerT>>(
{nelems},
AbsStridedFunctor<argTy, resTy, IndexerT>(arg_tp, res_tp, indexer));
});
return comp_ev;
return elementwise_common::unary_strided_impl<
argTy, AbsOutputType, AbsStridedFunctor, abs_strided_kernel>(
exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
res_offset, depends, additional_depends);
}

template <typename fnT, typename T> struct AbsStridedFactory
Expand Down
118 changes: 14 additions & 104 deletions dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,32 +184,10 @@ sycl::event add_contig_impl(sycl::queue exec_q,
py::ssize_t res_offset,
const std::vector<sycl::event> &depends = {})
{
sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);

size_t lws = 64;
constexpr unsigned int vec_sz = 4;
constexpr unsigned int n_vecs = 2;
const size_t n_groups =
((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
const auto gws_range = sycl::range<1>(n_groups * lws);
const auto lws_range = sycl::range<1>(lws);

using resTy = typename AddOutputType<argTy1, argTy2>::value_type;

const argTy1 *arg1_tp =
reinterpret_cast<const argTy1 *>(arg1_p) + arg1_offset;
const argTy2 *arg2_tp =
reinterpret_cast<const argTy2 *>(arg2_p) + arg2_offset;
resTy *res_tp = reinterpret_cast<resTy *>(res_p) + res_offset;

cgh.parallel_for<
add_contig_kernel<argTy1, argTy2, resTy, vec_sz, n_vecs>>(
sycl::nd_range<1>(gws_range, lws_range),
AddContigFunctor<argTy1, argTy2, resTy, vec_sz, n_vecs>(
arg1_tp, arg2_tp, res_tp, nelems));
});
return comp_ev;
return elementwise_common::binary_contig_impl<
argTy1, argTy2, AddOutputType, AddContigFunctor, add_contig_kernel>(
exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
res_offset, depends);
}

template <typename fnT, typename T1, typename T2> struct AddContigFactory
Expand Down Expand Up @@ -256,28 +234,11 @@ sycl::event add_strided_impl(sycl::queue exec_q,
const std::vector<sycl::event> &depends,
const std::vector<sycl::event> &additional_depends)
{
sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);
cgh.depends_on(additional_depends);

using resTy = typename AddOutputType<argTy1, argTy2>::value_type;

using IndexerT =
typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;

IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset,
shape_and_strides};

const argTy1 *arg1_tp = reinterpret_cast<const argTy1 *>(arg1_p);
const argTy2 *arg2_tp = reinterpret_cast<const argTy2 *>(arg2_p);
resTy *res_tp = reinterpret_cast<resTy *>(res_p);

cgh.parallel_for<
add_strided_strided_kernel<argTy1, argTy2, resTy, IndexerT>>(
{nelems}, AddStridedFunctor<argTy1, argTy2, resTy, IndexerT>(
arg1_tp, arg2_tp, res_tp, indexer));
});
return comp_ev;
return elementwise_common::binary_strided_impl<
argTy1, argTy2, AddOutputType, AddStridedFunctor,
add_strided_strided_kernel>(
exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
arg2_offset, res_p, res_offset, depends, additional_depends);
}

template <typename fnT, typename T1, typename T2> struct AddStridedFactory
Expand Down Expand Up @@ -322,62 +283,11 @@ sycl::event add_contig_matrix_contig_row_broadcast_impl(
py::ssize_t res_offset,
const std::vector<sycl::event> &depends = {})
{
const argT1 *mat = reinterpret_cast<const argT1 *>(mat_p) + mat_offset;
const argT2 *vec = reinterpret_cast<const argT2 *>(vec_p) + vec_offset;
resT *res = reinterpret_cast<resT *>(res_p) + res_offset;

const auto &dev = exec_q.get_device();
const auto &sg_sizes = dev.get_info<sycl::info::device::sub_group_sizes>();
// Get device-specific kernel info max_sub_group_size
size_t max_sgSize =
*(std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));

size_t n1_padded = n1 + max_sgSize;
argT2 *padded_vec = sycl::malloc_device<argT2>(n1_padded, exec_q);

if (padded_vec == nullptr) {
throw std::runtime_error("Could not allocate memory on the device");
}
sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends); // ensure vec contains actual data
cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) {
auto i = id[0];
padded_vec[i] = vec[i % n1];
});
});

// sub-group spans work-items [I, I + sgSize)
// base = ndit.get_global_linear_id() - sg.get_local_id()[0]
// Generically, sg.load( &mat[base]) may load arrays from
// different rows of mat. The start corresponds to row (base / n0)
// We read sg.load(&padded_vec[(base / n0)]). The vector is padded to
// ensure that reads are accessible

size_t lws = 64;

sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(make_padded_vec_ev);

auto lwsRange = sycl::range<1>(lws);
size_t n_elems = n0 * n1;
size_t n_groups = (n_elems + lws - 1) / lws;
auto gwsRange = sycl::range<1>(n_groups * lws);

cgh.parallel_for<
class add_matrix_row_broadcast_sg_krn<argT1, argT2, resT>>(
sycl::nd_range<1>(gwsRange, lwsRange),
AddContigMatrixContigRowBroadcastingFunctor<argT1, argT2, resT>(
mat, padded_vec, res, n_elems, n1));
});

sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(comp_ev);
sycl::context ctx = exec_q.get_context();
cgh.host_task([ctx, padded_vec]() { sycl::free(padded_vec, ctx); });
});
host_tasks.push_back(tmp_cleanup_ev);

return comp_ev;
return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl<
argT1, argT2, resT, AddContigMatrixContigRowBroadcastingFunctor,
add_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p,
mat_offset, vec_p, vec_offset, res_p,
res_offset, depends);
}

template <typename fnT, typename T1, typename T2>
Expand Down
Loading