-
Notifications
You must be signed in to change notification settings - Fork 50
Initial phase of integrating new BLAS 2 subroutines from merge_reduction… #61
Changes from 1 commit
c0073dc
2243be3
51dfd12
2ce2e0d
b634eb5
36e5fd9
171f65a
32bf6f4
0eb80bd
b688383
c2d8691
0617da1
b2b9e8f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -103,9 +103,9 @@ typename Executor::Return_Type _dot(Executor &ex, IndexType _N, ContainerT0 _vx, | |
auto rs = make_vector_view(ex, _rs, static_cast<IncrementType>(1), | ||
static_cast<IndexType>(1)); | ||
auto prdOp = make_op<BinaryOp, prdOp2_struct>(vx, vy); | ||
// TODO: (Mehdi) read them from the device | ||
auto localSize = 256; | ||
auto nWG = 512; | ||
|
||
auto localSize = ex.policy_handler().get_work_group_size(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. const auto localSize? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed |
||
auto nWG = 2 * localSize; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. const auto nWG? Also, why 2? Can you elaborate? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed. @josealiaga could you elaborate on why this is fixed to 2? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please, first, read the attached document: SYCL_BLAS1_reduction.txt If you analyze my original code, you will see that two alternatives are considered, related to the alternatives A and B in the attached document. In the merged code, only the case B appears, whereas the case A is the best one for OpenCL devices that I tested. |
||
auto assignOp = | ||
make_addAssignReduction(rs, prdOp, localSize, localSize * nWG); | ||
auto ret = ex.reduce(assignOp); | ||
|
@@ -127,8 +127,8 @@ typename Executor::Return_Type _asum(Executor &ex, IndexType _N, | |
auto rs = make_vector_view(ex, _rs, static_cast<IncrementType>(1), | ||
static_cast<IndexType>(1)); | ||
// TODO: (Mehdi) read them from the device | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The TODO does not apply anymore |
||
auto localSize = 256; | ||
auto nWG = 512; | ||
auto localSize = ex.policy_handler().get_work_group_size(); | ||
auto nWG = 2 * localSize; | ||
auto assignOp = | ||
make_addAbsAssignReduction(rs, vx, localSize, localSize * nWG); | ||
auto ret = ex.reduce(assignOp); | ||
|
@@ -149,7 +149,8 @@ typename Executor::Return_Type _iamax(Executor &ex, IndexType _N, | |
auto rs = make_vector_view(ex, _rs, static_cast<IncrementType>(1), | ||
static_cast<IndexType>(1)); | ||
// TODO: (Mehdi) take this value from device | ||
size_t localSize = 256, nWG = 512; | ||
auto localSize = ex.policy_handler().get_work_group_size(); | ||
auto nWG = 2 * localSize; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove TODOs and apply constants |
||
auto tupOp = make_tuple_op(vx); | ||
auto assignOp = | ||
make_maxIndAssignReduction(rs, tupOp, localSize, localSize * nWG); | ||
|
@@ -172,7 +173,8 @@ typename Executor::Return_Type _iamin(Executor &ex, IndexType _N, | |
static_cast<IndexType>(1)); | ||
|
||
// TODO: (Mehdi) read them from the device | ||
size_t localSize = 256, nWG = 512; | ||
auto localSize = ex.policy_handler().get_work_group_size(); | ||
auto nWG = 2 * localSize; | ||
auto tupOp = make_tuple_op(vx); | ||
auto assignOp = | ||
make_minIndAssignReduction(rs, tupOp, localSize, localSize * nWG); | ||
|
@@ -234,9 +236,9 @@ typename Executor::Return_Type _nrm2(Executor &ex, IndexType _N, | |
auto rs = make_vector_view(ex, _rs, static_cast<IncrementType>(1), | ||
static_cast<IndexType>(1)); | ||
auto prdOp = make_op<UnaryOp, prdOp1_struct>(vx); | ||
// TODO: (Mehdi) read them from the deivce | ||
auto localSize = 256; | ||
auto nWG = 512; | ||
|
||
auto localSize = ex.policy_handler().get_work_group_size(); | ||
auto nWG = 2 * localSize; | ||
auto assignOp = | ||
make_addAssignReduction(rs, prdOp, localSize, localSize * nWG); | ||
ex.reduce(assignOp); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -66,9 +66,9 @@ typename Executor::Return_Type _gemv_impl( | |
auto vy = make_vector_view(ex, _vy, _incy, M); | ||
|
||
const IndexType interLoop = 1; | ||
const IndexType localSize = | ||
(_localSize == 0) ? ex.get_rounded_power_of_two_work_group_size() | ||
: _localSize; | ||
const IndexType localSize = (_localSize == 0) | ||
? ex.policy_handler().get_work_group_size() | ||
: _localSize; | ||
const IndexType n_rows_WG = (_n_rows_WG == 0) | ||
? ((mA.getAccess()) ? 1 : localSize) | ||
: std::min(M, _n_rows_WG); | ||
|
@@ -135,9 +135,9 @@ typename Executor::Return_Type _trmv_impl( | |
auto vx = make_vector_view(ex, _vx, _incx, N); | ||
|
||
const IndexType interLoop = 1; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you comment on what interLoop is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @josealiaga could you please provide some documentation on the above issue? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. interLoop was introduced in the document SYCL_BLAS1_reduction.txt |
||
const IndexType localSize = | ||
(_localSize == 0) ? ex.get_rounded_power_of_two_work_group_size() | ||
: _localSize; | ||
const IndexType localSize = (_localSize == 0) | ||
? ex.policy_handler().get_work_group_size() | ||
: _localSize; | ||
const IndexType n_rows_WG = (_n_rows_WG == 0) | ||
? ((mA.getAccess()) ? 1 : localSize) | ||
: std::min(N, _n_rows_WG); | ||
|
@@ -250,9 +250,9 @@ typename Executor::Return_Type _symv_impl( | |
|
||
const IndexType interLoop = 1; | ||
|
||
const IndexType localSize = | ||
(_localSize == 0) ? ex.get_rounded_power_of_two_work_group_size() | ||
: _localSize; | ||
const IndexType localSize = (_localSize == 0) | ||
? ex.policy_handler().get_work_group_size() | ||
: _localSize; | ||
const IndexType shrMemSize = (_localSize == 0) ? localSize : _shrMemSize; | ||
|
||
const IndexType n_rows_WG_R = (_n_rows_WG == 0) ? 1 : std::min(N, _n_rows_WG); | ||
|
@@ -342,9 +342,9 @@ typename Executor::Return_Type _ger_impl( | |
auto vx = make_vector_view(ex, _vx, _incx, M); | ||
auto vy = make_vector_view(ex, _vy, _incy, N); | ||
|
||
const IndexType localSize = | ||
(_localSize == 0) ? ex.get_rounded_power_of_two_work_group_size() | ||
: _localSize; | ||
const IndexType localSize = (_localSize == 0) | ||
? ex.policy_handler().get_work_group_size() | ||
: _localSize; | ||
const IndexType n_rows_WG = (_n_rows_WG == 0) | ||
? ((mA.getAccess()) ? 1 : localSize) | ||
: std::min(M, _n_rows_WG); | ||
|
@@ -400,9 +400,9 @@ typename Executor::Return_Type _syr_impl( | |
auto mA = make_matrix_view(ex, _mA, N, N, _lda, accessOpr); | ||
auto vx = make_vector_view(ex, _vx, _incx, N); | ||
|
||
const IndexType localSize = | ||
(_localSize == 0) ? ex.get_rounded_power_of_two_work_group_size() | ||
: _localSize; | ||
const IndexType localSize = (_localSize == 0) | ||
? ex.policy_handler().get_work_group_size() | ||
: _localSize; | ||
const IndexType n_rows_WG = (_n_rows_WG == 0) | ||
? ((mA.getAccess()) ? 1 : localSize) | ||
: std::min(N, _n_rows_WG); | ||
|
@@ -470,9 +470,9 @@ typename Executor::Return_Type _syr2_impl( | |
auto vx = make_vector_view(ex, _vx, _incx, _N); | ||
auto vy = make_vector_view(ex, _vy, _incy, _N); | ||
|
||
const IndexType localSize = | ||
(_localSize == 0) ? ex.get_rounded_power_of_two_work_group_size() | ||
: _localSize; | ||
const IndexType localSize = (_localSize == 0) | ||
? ex.policy_handler().get_work_group_size() | ||
: _localSize; | ||
const IndexType n_rows_WG = (_n_rows_WG == 0) | ||
? ((mA.getAccess()) ? 1 : localSize) | ||
: std::min(N, _n_rows_WG); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why do you return a reference to the policy?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Because the queue interface contains the pointer mapper class, which deals with allocation and deallocation of buffers and memory. If the policy is not returned by reference, the compiler raises a
"call to implicitly-deleted copy constructor of 'Queue_Interface'" error.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Here is the error:
'Queue_Interface' is implicitly deleted because field 'pointer_mapper' has a deleted copy
constructor
mutable cl::sycl::codeplay::PointerMapper pointer_mapper;
^
../../../sycl-blas/include/queue/pointer_mapper.hpp:310:3: note: 'PointerMapper' has been
explicitly marked deleted here
PointerMapper(const PointerMapper&) = delete;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
used std::shared_ptr to remove the reference