diff --git a/include/executors/executor_base.hpp b/include/executors/executor_base.hpp index 43ad786ca..e34283640 100644 --- a/include/executors/executor_base.hpp +++ b/include/executors/executor_base.hpp @@ -40,13 +40,15 @@ namespace blas { template class Executor { public: + using Queue_Interface_Type = Queue_Interface; + using Return_Type = void; template void execute(Tree t) = delete; template inline T* allocate(size_t num_bytes); template inline void* deallocate(T* p); - inline Queue_Interface get_policy_handler(); + inline Queue_Interface_Type get_policy_handler(); template void wait(first_event_t first_event, next_event_t... next_events); void wait(); @@ -58,8 +60,12 @@ class Executor { */ template <> class Executor { + public: + using Queue_Interface_Type = Queue_Interface; + using Return_Type = void; + private: - Queue_Interface q_interface; + Queue_Interface_Type q_interface; public: template @@ -70,9 +76,7 @@ class Executor { } }; - inline Queue_Interface get_policy_handler() { - return q_interface; - } + inline Queue_Interface_Type get_policy_handler() { return q_interface; } template void wait(first_event_t, next_event_t...) {} void wait() {} @@ -84,7 +88,12 @@ class Executor { */ template <> class Executor { - Queue_Interface q_interface; + public: + using Queue_Interface_Type = Queue_Interface; + using Return_Type = void; + + private: + Queue_Interface_Type q_interface; public: template @@ -95,7 +104,7 @@ class Executor { t.eval(i); } }; - inline Queue_Interface get_policy_handler() { return q_interface; } + inline Queue_Interface_Type get_policy_handler() { return q_interface; } template void wait(first_event_t, next_event_t...) {} void wait() {} diff --git a/include/executors/executor_sycl.hpp b/include/executors/executor_sycl.hpp index 1a51bf00d..c28037503 100644 --- a/include/executors/executor_sycl.hpp +++ b/include/executors/executor_sycl.hpp @@ -244,10 +244,14 @@ static cl::sycl::event execute_tree(cl::sycl::queue q_, Tree t, */ template <> class Executor { - Queue_Interface q_interface; - public: + using Queue_Interface_Type = Queue_Interface; using Return_Type = cl::sycl::event; + + private: + Queue_Interface_Type q_interface; + + public: template < typename T, cl::sycl::access::mode AcM = cl::sycl::access::mode::read_write, @@ -260,11 +264,11 @@ class Executor { */ Executor(cl::sycl::queue q) : q_interface(q){}; - inline Queue_Interface get_policy_handler() { return q_interface; } + inline Queue_Interface_Type get_policy_handler() { return q_interface; } cl::sycl::queue get_queue() const { return q_interface.get_queue(); } - inline Queue_Interface::device_type get_device_type() { + inline Queue_Interface_Type::device_type get_device_type() { return q_interface.get_device_type(); } diff --git a/include/interface/blas3_interface.hpp b/include/interface/blas3_interface.hpp index bbde83d95..6a56a6dd6 100644 --- a/include/interface/blas3_interface.hpp +++ b/include/interface/blas3_interface.hpp @@ -58,10 +58,14 @@ typename Executor::Return_Type _select_gemm( auto buffer_a = make_matrix_view(ex, _A, _M, _K, _lda, 0); auto buffer_b = make_matrix_view(ex, _B, _K, _N, _ldb, 0); auto buffer_c = make_matrix_view(ex, _C, _M, _N, _ldc, 0); - +#ifndef NAIVE_GEMM #define ENABLE_GEMM_TRANSPOSE(_trans_a, _trans_b) \ if (_TransA == _trans_a && _TransB == _trans_b) { \ - if (ex.has_local_memory()) { \ + if (ex.has_local_memory() && \ + (ex.get_device_type() != \ + Executor::Queue_Interface_Type::device_type::SYCL_RCAR_CVENGINE) && \ + (ex.get_device_type() != \ + Executor::Queue_Interface_Type::device_type::SYCL_RCAR_HOST_CPU)) { \ auto gemm = make_gemm(buffer_a, buffer_b, buffer_c, \ T(_alpha), T(_beta)); \ @@ -73,7 +77,15 @@ typename Executor::Return_Type _select_gemm( } \ return ret; \ } - +#else +#define ENABLE_GEMM_TRANSPOSE(_trans_a, _trans_b) \ + if (_TransA == _trans_a && _TransB == _trans_b) { \ + auto gemm = make_gemm_no_local_mem( \ + buffer_a, buffer_b, buffer_c, T(_alpha), T(_beta)); \ + ret = ex.gemm_executor(gemm); \ + return ret; \ + } +#endif const bool NoTrans = false; const bool Trans = true; @@ -92,12 +104,12 @@ typename Executor::Return_Type _select_gemm( * * See netlib.org/blas for details. */ -template -cl::sycl::event _gemm(Executor& ex, char _TransA, char _TransB, - IndexType _M, IndexType _N, IndexType _K, T _alpha, - ContainerT0 _A, IndexType _lda, ContainerT1 _B, - IndexType _ldb, T _beta, ContainerT2 _C, IndexType _ldc) { +cl::sycl::event _gemm(Executor& ex, char _TransA, char _TransB, IndexType _M, + IndexType _N, IndexType _K, T _alpha, ContainerT0 _A, + IndexType _lda, ContainerT1 _B, IndexType _ldb, T _beta, + ContainerT2 _C, IndexType _ldc) { _TransA = tolower(_TransA); _TransB = tolower(_TransB); @@ -121,10 +133,16 @@ cl::sycl::event _gemm(Executor& ex, char _TransA, char _TransB, _ldc); \ } #ifndef NAIVE_GEMM - if (ex.get_device_type() == Queue_Interface::device_type::INTELGPU) { + if (ex.get_device_type() == + Executor::Queue_Interface_Type::device_type::SYCL_INTEL_GPU) { BIND_DATA_SIZE(1024, 4096, 1024) TO_TPARAMS(128, false, 4, 4, 16, 16); BIND_DATA_SIZE(10, 1024, 1024) TO_TPARAMS(128, false, 2, 2, 8, 8); BIND_DEFAULT TO_TPARAMS(128, false, 8, 8, 8, 8); + } else if ((ex.get_device_type() == Executor::Queue_Interface_Type:: + device_type::SYCL_RCAR_CVENGINE) && + (ex.get_device_type() == Executor::Queue_Interface_Type:: + device_type::SYCL_RCAR_HOST_CPU)) { + BIND_DEFAULT TO_TPARAMS(32, false, 8, 8, 8, 8); } else { BIND_DATA_SIZE(10, 1024, 1024) TO_TPARAMS(128, true, 1, 1, 16, 16); BIND_DEFAULT TO_TPARAMS(128, false, 8, 8, 16, 16); diff --git a/include/queue/queue_sycl.hpp b/include/queue/queue_sycl.hpp index 44514b177..59569cbec 100644 --- a/include/queue/queue_sycl.hpp +++ b/include/queue/queue_sycl.hpp @@ -45,7 +45,15 @@ class Queue_Interface { using generic_buffer_data_type = cl::sycl::codeplay::buffer_data_type_t; public: - enum device_type { UNSUPPORTED_DEVICE, INTELGPU, AMDGPU }; + enum device_type { + SYCL_CPU, + SYCL_HOST, + SYCL_UNSUPPORTED_DEVICE, + SYCL_INTEL_GPU, + SYCL_AMD_GPU, + SYCL_RCAR_CVENGINE, + SYCL_RCAR_HOST_CPU + }; explicit Queue_Interface(cl::sycl::queue q) : q_(q), @@ -62,14 +70,24 @@ class Queue_Interface { auto platform = dev.get_platform(); auto plat_name = platform.template get_info(); + auto device_type = + dev.template get_info(); std::transform(plat_name.begin(), plat_name.end(), plat_name.begin(), ::tolower); - if (plat_name.find("amd") != std::string::npos && dev.is_gpu()) { - return AMDGPU; - } else if (plat_name.find("intel") != std::string::npos && dev.is_gpu()) { - return INTELGPU; + if (plat_name.find("amd") != std::string::npos && + device_type == cl::sycl::info::device_type::gpu) { + return SYCL_AMD_GPU; + } else if (plat_name.find("intel") != std::string::npos && + device_type == cl::sycl::info::device_type::gpu) { + return SYCL_INTEL_GPU; + } else if (plat_name.find("computeaorta") != std::string::npos && + device_type == cl::sycl::info::device_type::accelerator) { + return SYCL_RCAR_CVENGINE; + } else if (plat_name.find("computeaorta") != std::string::npos && + device_type == cl::sycl::info::device_type::cpu) { + return SYCL_RCAR_HOST_CPU; } else { - return UNSUPPORTED_DEVICE; + return SYCL_UNSUPPORTED_DEVICE; } throw std::runtime_error("couldn't find device"); }