Skip to content
This repository has been archived by the owner on Jan 13, 2025. It is now read-only.

Initial phase of integrating new BL:AS 2 subroutines from merge_reduction… #61

Merged
merged 13 commits into from
Aug 17, 2018
Merged
Prev Previous commit
Next Next commit
set the local size to be taken from device.
  • Loading branch information
mehdi-goli committed Aug 10, 2018
commit 171f65ae4709adbe95d0d872fbb76948dd076df6
6 changes: 3 additions & 3 deletions include/executors/executor_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class Executor {
inline T* allocate(size_t num_bytes);
template <typename T>
inline void* deallocate(T* p);
inline void* device();
inline Queue_Interface<ExecutionPolicy> policy_handler();
};

/*! Executor<Sequential>.
Expand All @@ -67,7 +67,7 @@ class Executor<Sequential> {
}
};

inline Queue_Interface<Sequential> queue() { return q_interface; }
inline Queue_Interface<Sequential> policy_handler() { return q_interface; }
};

/*! Executor<Parallel>.
Expand All @@ -87,8 +87,8 @@ class Executor<Parallel> {
t.eval(i);
}
};
inline Queue_Interface<Parallel> policy_handler() { return q_interface; }
};
inline Queue_Interface<Parallel> queue() { return q_interface; }

} // namespace blas

Expand Down
7 changes: 3 additions & 4 deletions include/executors/executor_sycl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,8 @@ class Executor<SYCL> {
*/
Executor(cl::sycl::queue q) : q_interface(q){};

inline Queue_Interface<SYCL> &policy_handler() { return q_interface; }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do you return a reference to the policy?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because the queue interface contains the pointer mapper class which deals with allocation and deallocation of buffers and memory. The call to
implicitly-deleted copy constructor of 'Queue_Interface' error will rise in case of not returning the reference

Copy link
Collaborator Author

@mehdi-goli mehdi-goli Aug 16, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

here is the error :
'Queue_Interface' is implicitly deleted because field 'pointer_mapper' has a deleted copy
constructor
mutable cl::sycl::codeplay::PointerMapper pointer_mapper;
^
../../../sycl-blas/include/queue/pointer_mapper.hpp:310:3: note: 'PointerMapper' has been
explicitly marked deleted here
PointerMapper(const PointerMapper&) = delete;

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

used std::shared_ptr to remove the reference


cl::sycl::queue queue() const { return q_interface.queue(); }

inline Queue_Interface<SYCL>::device_type get_device_type() {
Expand Down Expand Up @@ -370,7 +372,7 @@ class Executor<SYCL> {
*/
template <typename Tree>
inline cl::sycl::event execute(Tree t) {
const auto localSize = 128;
const auto localSize = policy_handler().get_work_group_size();
auto _N = t.getSize();
auto nWG = (_N + localSize - 1) / localSize;
auto globalSize = nWG * localSize;
Expand All @@ -379,9 +381,6 @@ class Executor<SYCL> {
localSize, globalSize, 0);
};

inline size_t get_work_group_size() const {
return q_interface.get_work_group_size();
}
/*!
* @brief Executes the tree fixing the localSize but without defining
* required shared memory.
Expand Down
22 changes: 12 additions & 10 deletions include/interface/blas1_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,9 @@ typename Executor::Return_Type _dot(Executor &ex, IndexType _N, ContainerT0 _vx,
auto rs = make_vector_view(ex, _rs, static_cast<IncrementType>(1),
static_cast<IndexType>(1));
auto prdOp = make_op<BinaryOp, prdOp2_struct>(vx, vy);
// TODO: (Mehdi) read them from the device
auto localSize = 256;
auto nWG = 512;

auto localSize = ex.policy_handler().get_work_group_size();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

const auto localSize?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

auto nWG = 2 * localSize;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

const auto nWG? Also, why 2? can you ellaborate?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed. @josealiaga could you elaborate why this is fixed for 2?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please, first, read the attached document: SYCL_BLAS1_reduction.txt

If you analyze my original code, you will see that two alternatives are considered, related to the alternatives A and B in the attached document. In the merged code, only the case B appears, whereas the case A is the best one for OpenCL devices that I tested.

auto assignOp =
make_addAssignReduction(rs, prdOp, localSize, localSize * nWG);
auto ret = ex.reduce(assignOp);
Expand All @@ -127,8 +127,8 @@ typename Executor::Return_Type _asum(Executor &ex, IndexType _N,
auto rs = make_vector_view(ex, _rs, static_cast<IncrementType>(1),
static_cast<IndexType>(1));
// TODO: (Mehdi) read them from the device
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The TODO does not apply anymore

auto localSize = 256;
auto nWG = 512;
auto localSize = ex.policy_handler().get_work_group_size();
auto nWG = 2 * localSize;
auto assignOp =
make_addAbsAssignReduction(rs, vx, localSize, localSize * nWG);
auto ret = ex.reduce(assignOp);
Expand All @@ -149,7 +149,8 @@ typename Executor::Return_Type _iamax(Executor &ex, IndexType _N,
auto rs = make_vector_view(ex, _rs, static_cast<IncrementType>(1),
static_cast<IndexType>(1));
// TODO: (Mehdi) take this value from device
size_t localSize = 256, nWG = 512;
auto localSize = ex.policy_handler().get_work_group_size();
auto nWG = 2 * localSize;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove TODOs and apply constants

auto tupOp = make_tuple_op(vx);
auto assignOp =
make_maxIndAssignReduction(rs, tupOp, localSize, localSize * nWG);
Expand All @@ -172,7 +173,8 @@ typename Executor::Return_Type _iamin(Executor &ex, IndexType _N,
static_cast<IndexType>(1));

// TODO: (Mehdi) read them from the device
size_t localSize = 256, nWG = 512;
auto localSize = ex.policy_handler().get_work_group_size();
auto nWG = 2 * localSize;
auto tupOp = make_tuple_op(vx);
auto assignOp =
make_minIndAssignReduction(rs, tupOp, localSize, localSize * nWG);
Expand Down Expand Up @@ -234,9 +236,9 @@ typename Executor::Return_Type _nrm2(Executor &ex, IndexType _N,
auto rs = make_vector_view(ex, _rs, static_cast<IncrementType>(1),
static_cast<IndexType>(1));
auto prdOp = make_op<UnaryOp, prdOp1_struct>(vx);
// TODO: (Mehdi) read them from the deivce
auto localSize = 256;
auto nWG = 512;

auto localSize = ex.policy_handler().get_work_group_size();
auto nWG = 2 * localSize;
auto assignOp =
make_addAssignReduction(rs, prdOp, localSize, localSize * nWG);
ex.reduce(assignOp);
Expand Down
36 changes: 18 additions & 18 deletions include/interface/blas2_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,9 @@ typename Executor::Return_Type _gemv_impl(
auto vy = make_vector_view(ex, _vy, _incy, M);

const IndexType interLoop = 1;
const IndexType localSize =
(_localSize == 0) ? ex.get_rounded_power_of_two_work_group_size()
: _localSize;
const IndexType localSize = (_localSize == 0)
? ex.policy_handler().get_work_group_size()
: _localSize;
const IndexType n_rows_WG = (_n_rows_WG == 0)
? ((mA.getAccess()) ? 1 : localSize)
: std::min(M, _n_rows_WG);
Expand Down Expand Up @@ -135,9 +135,9 @@ typename Executor::Return_Type _trmv_impl(
auto vx = make_vector_view(ex, _vx, _incx, N);

const IndexType interLoop = 1;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you comment on what interLoop is

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@josealiaga could you please provide some documentation on the above issue?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

interLoop was introduced in the document SYCL_BLAS1_reduction.txt

const IndexType localSize =
(_localSize == 0) ? ex.get_rounded_power_of_two_work_group_size()
: _localSize;
const IndexType localSize = (_localSize == 0)
? ex.policy_handler().get_work_group_size()
: _localSize;
const IndexType n_rows_WG = (_n_rows_WG == 0)
? ((mA.getAccess()) ? 1 : localSize)
: std::min(N, _n_rows_WG);
Expand Down Expand Up @@ -250,9 +250,9 @@ typename Executor::Return_Type _symv_impl(

const IndexType interLoop = 1;

const IndexType localSize =
(_localSize == 0) ? ex.get_rounded_power_of_two_work_group_size()
: _localSize;
const IndexType localSize = (_localSize == 0)
? ex.policy_handler().get_work_group_size()
: _localSize;
const IndexType shrMemSize = (_localSize == 0) ? localSize : _shrMemSize;

const IndexType n_rows_WG_R = (_n_rows_WG == 0) ? 1 : std::min(N, _n_rows_WG);
Expand Down Expand Up @@ -342,9 +342,9 @@ typename Executor::Return_Type _ger_impl(
auto vx = make_vector_view(ex, _vx, _incx, M);
auto vy = make_vector_view(ex, _vy, _incy, N);

const IndexType localSize =
(_localSize == 0) ? ex.get_rounded_power_of_two_work_group_size()
: _localSize;
const IndexType localSize = (_localSize == 0)
? ex.policy_handler().get_work_group_size()
: _localSize;
const IndexType n_rows_WG = (_n_rows_WG == 0)
? ((mA.getAccess()) ? 1 : localSize)
: std::min(M, _n_rows_WG);
Expand Down Expand Up @@ -400,9 +400,9 @@ typename Executor::Return_Type _syr_impl(
auto mA = make_matrix_view(ex, _mA, N, N, _lda, accessOpr);
auto vx = make_vector_view(ex, _vx, _incx, N);

const IndexType localSize =
(_localSize == 0) ? ex.get_rounded_power_of_two_work_group_size()
: _localSize;
const IndexType localSize = (_localSize == 0)
? ex.policy_handler().get_work_group_size()
: _localSize;
const IndexType n_rows_WG = (_n_rows_WG == 0)
? ((mA.getAccess()) ? 1 : localSize)
: std::min(N, _n_rows_WG);
Expand Down Expand Up @@ -470,9 +470,9 @@ typename Executor::Return_Type _syr2_impl(
auto vx = make_vector_view(ex, _vx, _incx, _N);
auto vy = make_vector_view(ex, _vy, _incy, _N);

const IndexType localSize =
(_localSize == 0) ? ex.get_rounded_power_of_two_work_group_size()
: _localSize;
const IndexType localSize = (_localSize == 0)
? ex.policy_handler().get_work_group_size()
: _localSize;
const IndexType n_rows_WG = (_n_rows_WG == 0)
? ((mA.getAccess()) ? 1 : localSize)
: std::min(N, _n_rows_WG);
Expand Down
32 changes: 31 additions & 1 deletion include/queue/queue_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,36 @@ class Queue_Interface<Sequential> {
}; // namespace blastemplate<classExecutionPolicy>classQueue_Interface

template <>
class Queue_Interface<Parallel> : Queue_Interface<Sequential> {};
class Queue_Interface<Parallel> {
Queue_Interface() {}
/*
@brief This class is to determine whether or not the underlying device has
dedicated shared memory
*/
inline bool has_local_memory() { return false; }
/*
@brief This class is used to allocated the a regin of memory on the device
@tparam T the type of the pointer
@param num_elements number of elements of the buffer
*/
template <typename T>
inline T *allocate(size_t num_elements) const {
return std::malloc(num_elements * sizeof(T));
}
/*
@brief this class is to deallocate the provided region of memory on the device
@tparam T the type of the pointer
@param p the pointer to be deleted
*/
template <typename T>
inline void deallocate(T *p) const {
std::free(p);
}

// This function returns Work-group size which is equal to maximum device
// workgroup size.
inline size_t get_work_group_size() { return size_t(256); }
}; // namespace blastemplate<classExecutionPolicy>classQueue_Interface

} // namespace blas
#endif