Skip to content

Commit

Permalink
Target based gemm optimisation (codeplaysoftware#75)
Browse files Browse the repository at this point in the history
* Add Target option to enable/disable specific target optimisation at compile time.
* Optimising GEMM for RCAR targets.
* Adding SYCLBLAS Configuration to both benchmark and test
  • Loading branch information
mehdi-goli authored Sep 26, 2018
1 parent 1b2cd2b commit 2af4200
Show file tree
Hide file tree
Showing 6 changed files with 103 additions and 58 deletions.
34 changes: 1 addition & 33 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,12 @@ project(sycl-blas)

enable_testing()

message(STATUS "Path to the SYCL implementation ")

set(SYCLBLAS_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/include)

set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules)

include(FindOpenCL)

include(FindComputeCpp)
include(ConfigureSYCLBLAS)

include(FindBLAS)

Expand All @@ -31,35 +28,6 @@ else()
message(STATUS "Found BLAS library at: ${BLAS_LIBRARIES}")
endif()

message(STATUS "${COMPUTECPP_DEVICE_COMPILER_FLAGS}")

# We add some flags to workaround OpenCL platform bugs, see ComputeCpp documentation
set(COMPUTECPP_DEVICE_COMPILER_FLAGS
"${COMPUTECPP_DEVICE_COMPILER_FLAGS} -no-serial-memop -sycl-compress-name")
message(STATUS "${COMPUTECPP_DEVICE_COMPILER_FLAGS}")

# Check to see if we've disabled double support in the tests
option(NO_DOUBLE_SUPPORT "Disable double support when testing." off)
if(NO_DOUBLE_SUPPORT)
# Define NO_DOUBLE_SUPPORT for the host cxx compiler
add_definitions(-DNO_DOUBLE_SUPPORT)
# Set the computecpp device compiler flags to also define NO_DOUBLE_SUPPORT
list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -DNO_DOUBLE_SUPPORT)
endif()

# If the user has specified a specific workgroup size for tests, pass that on to the compiler
if(WG_SIZE)
add_definitions(-DWG_SIZE=${WG_SIZE})
list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -DWG_SIZE=${WG_SIZE})
endif()

# If the user has specified that we should use naive gemm, enable that
option(NAIVE_GEMM "Default to naive GEMM implementations" off)
if(NAIVE_GEMM)
add_definitions(-DNAIVE_GEMM)
list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -DNAIVE_GEMM)
endif()

include_directories(${SYCLBLAS_INCLUDE} ${ComputeCpp_INCLUDE_DIRS} ${BLAS_INCLUDE_DIRS})

add_subdirectory(test)
11 changes: 5 additions & 6 deletions bench/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@ find_package(PkgConfig)

message(STATUS "Path to the SYCL implementation ")

set(SYCLBLAS_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/../include")
set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/Modules")
set(SYCLBLAS_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../include)
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../cmake/Modules)

include(FindOpenCL)
include(FindComputeCpp)
include(ConfigureSYCLBLAS)

# Manually search for CLBLAST
# if (DEFINED CLBLAST_ROOT)
Expand Down Expand Up @@ -68,7 +70,4 @@ add_sycl_to_target(
SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/syclblas/syclblas_benchmark_level3.cpp
)

set(COMPUTECPP_DEVICE_COMPILER_FLAGS "${COMPUTECPP_DEVICE_COMPILER_FLAGS} -no-serial-memop -sycl-compress-name")
message(STATUS "${COMPUTECPP_DEVICE_COMPILER_FLAGS}")

include_directories(${SYCLBLAS_INCLUDE} ${COMPUTECPP_INCLUDE_DIRECTORY})
include_directories(${SYCLBLAS_INCLUDE} ${ComputeCpp_INCLUDE_DIRS})
41 changes: 41 additions & 0 deletions cmake/Modules/ConfigureSYCLBLAS.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@

# We add some flags to workaround OpenCL platform bugs, see ComputeCpp documentation
set(COMPUTECPP_DEVICE_COMPILER_FLAGS
"${COMPUTECPP_DEVICE_COMPILER_FLAGS} -no-serial-memop -Xclang -cl-mad-enable -O3")
message(STATUS "${COMPUTECPP_DEVICE_COMPILER_FLAGS}")

# Check to see if we've disabled double support in the tests
option(NO_DOUBLE_SUPPORT "Disable double support when testing." off)
if(NO_DOUBLE_SUPPORT)
# Define NO_DOUBLE_SUPPORT for the host cxx compiler
add_definitions(-DNO_DOUBLE_SUPPORT)
endif()

# If the user has specified a specific workgroup size for tests, pass that on to the compiler
if(WG_SIZE)
add_definitions(-DWG_SIZE=${WG_SIZE})
endif()

# If the user has specified that we should use naive gemm, enable that
option(NAIVE_GEMM "Default to naive GEMM implementations" off)
if(NAIVE_GEMM)
add_definitions(-DNAIVE_GEMM)
endif()

if(DEFINED TARGET)
message(STATUS "TARGET is defined")
if(${TARGET} STREQUAL "INTEL_GPU")
message(STATUS "${TARGET} device is chosen")
add_definitions(-DINTEL_GPU)
# If the user has specified RCAR as a target backend the optimisation for all other device will be disabled
elseif(${TARGET} STREQUAL "RCAR")
message(STATUS "${TARGET} device is chosen")
add_definitions(-DRCAR)
else()
message(STATUS "No specific TARGET is defined. TARGET will be selected at runtime.")
add_definitions(-DDYNAMIC)
endif()
else()
message(STATUS "No specific TARGET is defined. TARGET will be selected at runtime.")
add_definitions(-DDYNAMIC)
endif()
67 changes: 51 additions & 16 deletions include/interface/blas3_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,7 @@ typename Executor::Return_Type _select_gemm(
#ifndef NAIVE_GEMM
#define ENABLE_GEMM_TRANSPOSE(_trans_a, _trans_b) \
if (_TransA == _trans_a && _TransB == _trans_b) { \
if (ex.has_local_memory() && \
(ex.get_device_type() != \
Executor::Queue_Interface_Type::device_type::SYCL_RCAR_CVENGINE) && \
(ex.get_device_type() != \
Executor::Queue_Interface_Type::device_type::SYCL_RCAR_HOST_CPU)) { \
if (ex.has_local_memory()) { \
auto gemm = make_gemm<DoubleBuffer, ConflictA, ConflictB, ClSize, TileT, \
_trans_a, _trans_b>(buffer_a, buffer_b, buffer_c, \
T(_alpha), T(_beta)); \
Expand Down Expand Up @@ -124,31 +120,70 @@ cl::sycl::event _gemm(Executor& ex, char _TransA, char _TransB, IndexType _M,
#define BIND_DATA_SIZE(_m, _n, _k) if (_M == (_m) && _N == (_n) && _K == (_k))

#define BIND_DEFAULT

#define TO_TPARAMS(_wg, _db, _tir, _tic, _twr, _twc) \
/*
* @tparam _tir the number of rows processed by each work item
* @tparam _tic the number of columns processed by each work item
* @tparam _twr the number of item-level tiles within each column of
* block-level tile
* @tparam _twc the number of item-level tiles within each row of
* block-level tile
* @tparam _wg the total number of work-groupsize for the naive algorithm. It
* is only used for the naive algorithm.
* @tparam _clsize the size of the cache line of the architecture in bytes
* (If the value passed in is smaller than the actual cache
* line size, some values fetched will be wasted, which can
* significantly reduce performance. It can be set to a
* multiple of the physical cache line size. In this case, it
* will significantly increase scratchpad memory usage, but
* will result in fewer local barriers.)
* Note:
* _tir * _twr must be equal to _tic * _twc.
* This is ensured iff: (item_rows | wg_cols) and (item_cols | wg_rows)
* _clsize cannot be bigger than _twr * _twc * sizeof(T)
*/
#define TO_TPARAMS(_wg, _db, _clsize, _tir, _tic, _twr, _twc) \
{ \
return _select_gemm<_wg, _db, false, false, 64, \
return _select_gemm<_wg, _db, false, false, _clsize, \
Tile<_tir, _tic, _twr, _twc>>( \
ex, _TrA, _TrB, _M, _N, _K, _alpha, _A, _lda, _B, _ldb, _beta, _C, \
_ldc); \
}
#ifndef NAIVE_GEMM
#if defined(DYNAMIC)
if (ex.get_device_type() ==
Executor::Queue_Interface_Type::device_type::SYCL_INTEL_GPU) {
BIND_DATA_SIZE(1024, 4096, 1024) TO_TPARAMS(128, false, 4, 4, 16, 16);
BIND_DATA_SIZE(10, 1024, 1024) TO_TPARAMS(128, false, 2, 2, 8, 8);
BIND_DEFAULT TO_TPARAMS(128, false, 8, 8, 8, 8);
BIND_DATA_SIZE(1024, 4096, 1024) TO_TPARAMS(128, false, 64, 4, 4, 16, 16);
BIND_DATA_SIZE(10, 1024, 1024) TO_TPARAMS(128, false, 64, 2, 2, 8, 8);
BIND_DEFAULT TO_TPARAMS(128, false, 64, 8, 8, 8, 8);
} else if ((ex.get_device_type() == Executor::Queue_Interface_Type::
device_type::SYCL_RCAR_CVENGINE) &&
device_type::SYCL_RCAR_CVENGINE) ||
(ex.get_device_type() == Executor::Queue_Interface_Type::
device_type::SYCL_RCAR_HOST_CPU)) {
BIND_DEFAULT TO_TPARAMS(32, false, 8, 8, 8, 8);
if (_M < 512 && _N < 512) {
BIND_DEFAULT TO_TPARAMS(32, false, 128, 4, 8, 8, 4);
} else {
BIND_DEFAULT TO_TPARAMS(32, false, 128, 8, 4, 4, 8);
}
} else {
BIND_DATA_SIZE(10, 1024, 1024) TO_TPARAMS(128, true, 64, 1, 1, 16, 16);
BIND_DEFAULT TO_TPARAMS(128, false, 64, 8, 8, 16, 16);
}
#elif defined(INTEL_GPU)
BIND_DATA_SIZE(1024, 4096, 1024) TO_TPARAMS(128, false, 64, 4, 4, 16, 16);
BIND_DATA_SIZE(10, 1024, 1024) TO_TPARAMS(128, false, 64, 2, 2, 8, 8);
BIND_DEFAULT TO_TPARAMS(128, false, 64, 8, 8, 8, 8);
#elif defined(RCAR)
if (_M < 512 && _N < 512) {
BIND_DEFAULT TO_TPARAMS(32, false, 128, 4, 8, 8, 4);
} else {
BIND_DATA_SIZE(10, 1024, 1024) TO_TPARAMS(128, true, 1, 1, 16, 16);
BIND_DEFAULT TO_TPARAMS(128, false, 8, 8, 16, 16);
BIND_DEFAULT TO_TPARAMS(32, false, 128, 8, 4, 4, 8);
}
#else // any other specified devices
BIND_DATA_SIZE(10, 1024, 1024) TO_TPARAMS(128, true, 64, 1, 1, 16, 16);
BIND_DEFAULT TO_TPARAMS(128, false, 64, 8, 8, 16, 16);
#endif
#else
BIND_DEFAULT TO_TPARAMS(WG_SIZE, false, 8, 8, 8, 8);
BIND_DEFAULT TO_TPARAMS(WG_SIZE, false, 64, 8, 8, 8, 8);
#endif

#undef BIND_DATA_SIZE
Expand Down
2 changes: 2 additions & 0 deletions include/operations/blas3_trees.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,13 +389,15 @@ class GemmFactory {
((n - 1) / big_tile_cols + 1) * tl_rows *
tl_cols);
const cl::sycl::range<1> wgs(wg_size);
#ifdef VERBOSE
std::cout << " M: " << m << " , N " << n
<< " , big_tile_rows: " << big_tile_rows
<< " , big_tile_cols: " << big_tile_cols
<< " , wg_size: " << wg_size << " , nwg : "
<< ((m - 1) / big_tile_rows + 1) * ((n - 1) / big_tile_cols + 1) *
tl_rows * tl_cols
<< std::endl;
#endif
return cl::sycl::nd_range<1>(nwg * wgs, wgs);
}

Expand Down
6 changes: 3 additions & 3 deletions test/unittest/blas3/gemm/blas3_gemm_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@

#include "blas_test.hpp"

#ifndef BlasTypes
#error "BlasTypes not defined before including blas3_gemm_def.hpp"
#endif
#ifndef BlasTypes
#error "BlasTypes not defined before including blas3_gemm_def.hpp"
#endif

TYPED_TEST_CASE(BLAS_Test, BlasTypes);

Expand Down

0 comments on commit 2af4200

Please sign in to comment.