From 6fddc216cabbf7889db710d9ea764be710a212f4 Mon Sep 17 00:00:00 2001 From: tsung-wei-huang Date: Sat, 29 Feb 2020 17:59:40 -0700 Subject: [PATCH] updated cuda --- CMakeLists.txt | 34 ++-- taskflow/core/graph.hpp | 2 +- taskflow/core/taskflow.hpp | 2 +- taskflow/cuda/device.hpp | 70 -------- taskflow/cuda/error.hpp | 31 ---- taskflow/cuda/flow_builder.hpp | 168 ------------------ taskflow/cuda/graph.hpp | 204 ---------------------- taskflow/cuda/task.hpp | 222 ------------------------ unittests/cuda/basics.cu | 306 --------------------------------- 9 files changed, 19 insertions(+), 1020 deletions(-) delete mode 100644 taskflow/cuda/device.hpp delete mode 100644 taskflow/cuda/error.hpp delete mode 100644 taskflow/cuda/flow_builder.hpp delete mode 100644 taskflow/cuda/graph.hpp delete mode 100644 taskflow/cuda/task.hpp delete mode 100644 unittests/cuda/basics.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 51f85bdf8..b90ce3e00 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -472,24 +472,24 @@ if(${TF_ENABLE_CUDA}) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${TF_UTEST_DIR}/cuda) -add_executable(cuda-basics ${TF_UTEST_DIR}/cuda/basics.cu) +add_executable(cuda_basics ${TF_UTEST_DIR}/cuda/cuda_basics.cu) target_link_libraries( - cuda-basics ${PROJECT_NAME} Threads::Threads tf::default_settings -) -target_include_directories(cuda-basics PRIVATE ${TF_3RD_PARTY_DIR}/doctest) -add_test(cuda-basics.builder ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Builder) -add_test(cuda-basics.set.i8 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Set.i8) -add_test(cuda-basics.set.i16 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Set.i16) -add_test(cuda-basics.set.i32 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Set.i32) -add_test(cuda-basics.bset.i8 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=BSet.i8) -add_test(cuda-basics.bset.i16 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=BSet.i16) -add_test(cuda-basics.bset.i32 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=BSet.i32) -add_test(cuda-basics.add.i8 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Add.i8) -add_test(cuda-basics.add.i16 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Add.i16) -add_test(cuda-basics.add.i32 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Add.i32) -add_test(cuda-basics.barrier.i8 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Barrier.i8) -add_test(cuda-basics.barrier.i16 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Barrier.i16) -add_test(cuda-basics.barrier.i32 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Barrier.i32) + cuda_basics ${PROJECT_NAME} Threads::Threads tf::default_settings +) +target_include_directories(cuda_basics PRIVATE ${TF_3RD_PARTY_DIR}/doctest) +add_test(cuda_basics.builder ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Builder) +add_test(cuda_basics.set.i8 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Set.i8) +add_test(cuda_basics.set.i16 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Set.i16) +add_test(cuda_basics.set.i32 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Set.i32) +add_test(cuda_basics.bset.i8 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=BSet.i8) +add_test(cuda_basics.bset.i16 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=BSet.i16) +add_test(cuda_basics.bset.i32 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=BSet.i32) +add_test(cuda_basics.add.i8 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Add.i8) +add_test(cuda_basics.add.i16 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Add.i16) +add_test(cuda_basics.add.i32 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Add.i32) +add_test(cuda_basics.barrier.i8 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Barrier.i8) +add_test(cuda_basics.barrier.i16 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Barrier.i16) +add_test(cuda_basics.barrier.i32 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Barrier.i32) endif(${TF_ENABLE_CUDA}) diff --git a/taskflow/core/graph.hpp b/taskflow/core/graph.hpp index b912eedc3..5296ca6e3 100644 --- a/taskflow/core/graph.hpp +++ b/taskflow/core/graph.hpp @@ -9,7 +9,7 @@ #if defined(__CUDA__) || defined(__CUDACC__) #define TF_ENABLE_CUDA -#include "../cuda/flow_builder.hpp" +#include "../cuda/cuda_flow_builder.hpp" #endif namespace tf { diff --git a/taskflow/core/taskflow.hpp b/taskflow/core/taskflow.hpp index 63b95fcd0..89c05dabb 100644 --- a/taskflow/core/taskflow.hpp +++ b/taskflow/core/taskflow.hpp @@ -206,7 +206,7 @@ inline void Taskflow::_dump( #ifdef TF_ENABLE_CUDA case Node::CUDAFLOW_WORK: - os << "shape=folder fillcolor=gray style=filled"; + os << "shape=folder fillcolor=cyan style=filled"; break; #endif diff --git a/taskflow/cuda/device.hpp b/taskflow/cuda/device.hpp deleted file mode 100644 index c763bbe5e..000000000 --- a/taskflow/cuda/device.hpp +++ /dev/null @@ -1,70 +0,0 @@ -#pragma once - -#include "error.hpp" - -namespace tf { - -/** -@brief queries the number of available devices -*/ -inline unsigned cuda_num_devices() { - int N = 0; - TF_CHECK_CUDA(cudaGetDeviceCount(&N), "failed to get device count"); - return N; -} - -/** -@brief gets the current device associated with the caller thread -*/ -inline int cuda_get_device() { - int id; - TF_CHECK_CUDA(cudaGetDevice(&id), "failed to get current device id"); - return id; -} - -/** -@brief switches to a given device context -*/ -inline void cuda_set_device(int id) { - TF_CHECK_CUDA(cudaSetDevice(id), "failed to switch to device ", id); -} - -/** @class cudaScopedDevice - -@brief RAII-style device context switch - -*/ -class cudaScopedDevice { - - public: - - cudaScopedDevice(int); - ~cudaScopedDevice(); - - private: - - int _p; -}; - -// Constructor -inline cudaScopedDevice::cudaScopedDevice(int dev) { - TF_CHECK_CUDA(cudaGetDevice(&_p), "failed to get current device scope"); - if(_p == dev) { - _p = -1; - } - else { - TF_CHECK_CUDA(cudaSetDevice(dev), "failed to scope on device ", dev); - } -} - -// Destructor -inline cudaScopedDevice::~cudaScopedDevice() { - if(_p != -1) { - cudaSetDevice(_p); - //TF_CHECK_CUDA(cudaSetDevice(_p), "failed to scope back to device ", _p); - } -} - -} // end of namespace cuda --------------------------------------------------- - - diff --git a/taskflow/cuda/error.hpp b/taskflow/cuda/error.hpp deleted file mode 100644 index a2b679571..000000000 --- a/taskflow/cuda/error.hpp +++ /dev/null @@ -1,31 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#include "../utility/stringify.hpp" - -#define TF_CUDA_REMOVE_FIRST_HELPER(N, ...) __VA_ARGS__ -#define TF_CUDA_REMOVE_FIRST(...) TF_CUDA_REMOVE_FIRST_HELPER(__VA_ARGS__) -#define TF_CUDA_GET_FIRST_HELPER(N, ...) N -#define TF_CUDA_GET_FIRST(...) TF_CUDA_GET_FIRST_HELPER(__VA_ARGS__) - -#define TF_CHECK_CUDA(...) \ -if(TF_CUDA_GET_FIRST(__VA_ARGS__) != cudaSuccess) { \ - std::ostringstream oss; \ - auto ev = TF_CUDA_GET_FIRST(__VA_ARGS__); \ - auto unknown_str = "unknown error"; \ - auto unknown_name = "cudaErrorUnknown"; \ - auto error_str = ::cudaGetErrorString(ev); \ - auto error_name = ::cudaGetErrorName(ev); \ - oss << "[" << __FILE__ << ":" << __LINE__ << "] " \ - << (error_str ? error_str : unknown_str) \ - << " (" \ - << (error_name ? error_name : unknown_name) \ - << ") - "; \ - tf::ostreamize(oss, TF_CUDA_REMOVE_FIRST(__VA_ARGS__)); \ - throw std::runtime_error(oss.str()); \ -} - diff --git a/taskflow/cuda/flow_builder.hpp b/taskflow/cuda/flow_builder.hpp deleted file mode 100644 index 640532c24..000000000 --- a/taskflow/cuda/flow_builder.hpp +++ /dev/null @@ -1,168 +0,0 @@ -#pragma once - -#include "task.hpp" - -namespace tf { - -/** -@class cudaFlow - -@brief Building methods of a cuda task dependency graph. -*/ -class cudaFlow { - - public: - - /** - @brief constructs a cudaFlow builder object - - @param graph a cudaGraph to manipulate - */ - cudaFlow(cudaGraph& graph); - - /** - @brief queries the emptiness of the graph - */ - bool empty() const; - - /** - @brief creates a placeholder task - */ - cudaTask placeholder(); - - /** - @brief creates a no-operation task - - An empty node performs no operation during execution, - but can be used for transitive ordering. - For example, a phased execution graph with 2 groups of n nodes - with a barrier between them can be represented using an empty node - and 2*n dependency edges, - rather than no empty node and n^2 dependency edges. - */ - cudaTask noop(); - - /** - @brief creates a kernel task - - @tparam F kernel function type - @tparam ArgsT kernel function parameters type - - @param g configured grid - @param b configured block - @param s configured shared memory - @param f kernel function - @param args arguments to forward to the kernel function by copy - - @return cudaTask handle - */ - template - cudaTask kernel(dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args); - - /** - @brief creates an 1D copy task - - @tparam T element type (non-void) - - @param tgt pointer to the target memory block - @param src pointer to the source memory block - @param num number of elements to copy - - @return cudaTask handle - - A copy task transfers num*sizeof(T) bytes of data from a source location - to a target location. Direction can be either cpu-to-gpu, gpu-to-cpu, - gpu-to-gpu, or gpu-to-cpu. - */ - template < - typename T, - std::enable_if_t::value, void>* = nullptr - > - cudaTask copy(T* tgt, T* src, size_t num); - - private: - - cudaGraph& _graph; -}; - -// Constructor -inline cudaFlow::cudaFlow(cudaGraph& g) : _graph {g} { -} - -// Function: empty -inline bool cudaFlow::empty() const { - return _graph._nodes.empty(); -} - -// Function: noop -inline cudaTask cudaFlow::noop() { - auto node = _graph.emplace_back(); - node->_handle.emplace(); - TF_CHECK_CUDA( - ::cudaGraphAddEmptyNode(&node->_node, _graph._handle, nullptr, 0), - "failed to create a no-operation (empty) node" - ); - return cudaTask(node); -} - -// Function: kernel -template -cudaTask cudaFlow::kernel( - dim3 grid, dim3 block, size_t shm, F&& func, ArgsT&&... args -) { - - using traits = function_traits; - - static_assert(traits::arity == sizeof...(ArgsT), "arity mismatches"); - - void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... }; - - auto node = _graph.emplace_back(); - - auto& p = node->_handle.emplace().param; - - p.func = (void*)func; - p.gridDim = grid; - p.blockDim = block; - p.sharedMemBytes = shm; - p.kernelParams = arguments; - p.extra = nullptr; - - TF_CHECK_CUDA( - ::cudaGraphAddKernelNode(&node->_node, _graph._handle, nullptr, 0, &p), - "failed to create a cudaKernel node" - ); - - return cudaTask(node); -} - -// Function: copy -template < - typename T, - std::enable_if_t::value, void>* -> -cudaTask cudaFlow::copy(T* tgt, T* src, size_t num) { - - using U = std::decay_t; - - auto node = _graph.emplace_back(); - auto& p = node->_handle.emplace().param; - - p.srcArray = nullptr; - p.srcPos = ::make_cudaPos(0, 0, 0); - p.srcPtr = ::make_cudaPitchedPtr(src, num*sizeof(U), num, 1); - p.dstArray = nullptr; - p.dstPos = ::make_cudaPos(0, 0, 0); - p.dstPtr = ::make_cudaPitchedPtr(tgt, num*sizeof(U), num, 1); - p.extent = ::make_cudaExtent(num*sizeof(U), 1, 1); - p.kind = cudaMemcpyDefault; - - TF_CHECK_CUDA( - cudaGraphAddMemcpyNode(&node->_node, _graph._handle, nullptr, 0, &p), - "failed to create a cudaCopy node" - ); - - return cudaTask(node); -} - -} // end of namespace tf ----------------------------------------------------- diff --git a/taskflow/cuda/graph.hpp b/taskflow/cuda/graph.hpp deleted file mode 100644 index f6dda2590..000000000 --- a/taskflow/cuda/graph.hpp +++ /dev/null @@ -1,204 +0,0 @@ -#pragma once - -#include "device.hpp" - -#include "../utility/object_pool.hpp" -#include "../utility/traits.hpp" -#include "../utility/passive_vector.hpp" -#include "../nstd/variant.hpp" - -namespace tf { - -// ---------------------------------------------------------------------------- -// cudaNode class -// ---------------------------------------------------------------------------- - -// class: cudaNode -class cudaNode { - - friend class cudaFlow; - friend class cudaGraph; - friend class cudaTask; - - friend class Taskflow; - friend class Executor; - - // Host handle - //struct Host { - // cudaHostNodeParams param; - //}; - - struct Noop { - - }; - - // Copy handle - struct Copy { - - template - Copy(ArgsT&&...); - - cudaMemcpy3DParms param; - }; - - // Kernel handle - struct Kernel { - - template - Kernel(ArgsT&&...); - - cudaKernelNodeParams param; - }; - - using handle_t = nstd::variant; - - // variant index - constexpr static auto NOOP = get_index_v; - constexpr static auto COPY = get_index_v; - constexpr static auto KERNEL = get_index_v; - - public: - - cudaNode(cudaGraph&); - - private: - - cudaGraph& _graph; - - std::string _name; - - handle_t _handle; - - cudaGraphNode_t _node {nullptr}; - - PassiveVector _successors; - - void _precede(cudaNode*); -}; - -// ---------------------------------------------------------------------------- -// cudaGraph class -// ---------------------------------------------------------------------------- - -// class: cudaGraph -class cudaGraph { - - friend class cudaFlow; - friend class cudaNode; - friend class cudaTask; - - friend class Taskflow; - friend class Executor; - - public: - - cudaGraph(); - ~cudaGraph(); - - template - cudaNode* emplace_back(ArgsT&&...); - - cudaGraph_t native_handle(); - - void clear(); - - bool empty() const; - - private: - - cudaGraph_t _handle {nullptr}; - - std::vector> _nodes; -}; - -// ---------------------------------------------------------------------------- -// cudaNode definitions -// ---------------------------------------------------------------------------- - -// Copy handle constructor -template -cudaNode::Copy::Copy(ArgsT&&... args) { -} - -// Kernel handle constructor -template -cudaNode::Kernel::Kernel(ArgsT&&... args) { -} - -// Constructor -inline cudaNode::cudaNode(cudaGraph& g) : _graph {g} { -} - -// Procedure: _precede -inline void cudaNode::_precede(cudaNode* v) { - _successors.push_back(v); - TF_CHECK_CUDA( - ::cudaGraphAddDependencies(_graph._handle, &_node, &(v->_node), 1), - "failed to add a preceding link" - ); -} - -// ---------------------------------------------------------------------------- -// cudaGraph definitions -// ---------------------------------------------------------------------------- - -// Constructor -inline cudaGraph::cudaGraph() { - TF_CHECK_CUDA(cudaGraphCreate(&_handle, 0), "failed to create a cudaGraph"); -} - -// Destructor -inline cudaGraph::~cudaGraph() { - cudaGraphDestroy(_handle); -} - -// Function: empty -inline bool cudaGraph::empty() const { - return _nodes.empty(); -} - -// Procedure: clear -inline void cudaGraph::clear() { - - _nodes.clear(); - - cudaGraphDestroy(_handle); - TF_CHECK_CUDA( - cudaGraphCreate(&_handle, 0), - "failed to create a cudaGraph after clear" - ); -} - -// Function: emplace_back -template -cudaNode* cudaGraph::emplace_back(ArgsT&&... args) { - auto node = std::make_unique(*this, std::forward(args)...); - _nodes.emplace_back(std::move(node)); - return _nodes.back().get(); -} - -// Function: native_handle -inline cudaGraph_t cudaGraph::native_handle() { - return _handle; -} - - -//inline void cudaGraph::run() { -// cudaGraphExec_t graphExec; -// TF_CHECK_CUDA( -// cudaGraphInstantiate(&graphExec, _handle, nullptr, nullptr, 0), -// "failed to create an executable cudaGraph" -// ); -// TF_CHECK_CUDA(cudaGraphLaunch(graphExec, 0), "failed to launch cudaGraph") -// TF_CHECK_CUDA(cudaStreamSynchronize(0), "failed to sync cudaStream"); -// TF_CHECK_CUDA( -// cudaGraphExecDestroy(graphExec), "failed to destroy an executable cudaGraph" -// ); -//} - - - - - -} // end of namespace tf ----------------------------------------------------- - diff --git a/taskflow/cuda/task.hpp b/taskflow/cuda/task.hpp deleted file mode 100644 index ae0bf1cc6..000000000 --- a/taskflow/cuda/task.hpp +++ /dev/null @@ -1,222 +0,0 @@ -#pragma once - -#include "graph.hpp" - -namespace tf { - -/** -@class Task - -@brief task handle to a node in a cudaGraph -*/ -class cudaTask { - - friend class cudaFlow; - - public: - - /** - @brief constructs an empty cudaTask - */ - cudaTask() = default; - - /** - @brief copy-constructs a cudaTask - */ - cudaTask(const cudaTask&) = default; - - /** - @brief copy-assigns a cudaTask - */ - cudaTask& operator = (const cudaTask&) = default; - - /** - @brief adds precedence links from this to other tasks - - @tparam Ts... parameter pack - - @param tasks one or multiple tasks - - @return @c *this - */ - template - cudaTask& precede(Ts&&... tasks); - - /** - @brief adds precedence links from other tasks to this - - @tparam Ts... parameter pack - - @param tasks one or multiple tasks - - @return @c *this - */ - template - cudaTask& succeed(Ts&&... tasks); - - /** - @brief assigns a name to the task - - @param name a @std_string acceptable string - - @return @c *this - */ - cudaTask& name(const std::string& name); - - /** - @brief queries the name of the task - */ - const std::string& name() const; - - /** - @brief queries the number of successors - */ - size_t num_successors() const; - - /** - @brief queries if the task is associated with a cudaNode - */ - bool empty() const; - - private: - - cudaTask(cudaNode*); - - cudaNode* _node {nullptr}; - - template - void _precede(T&&); - - template - void _precede(T&&, Ts&&...); - - template - void _succeed(T&&); - - template - void _succeed(T&&, Ts&&...); -}; - -// Constructor -inline cudaTask::cudaTask(cudaNode* node) : _node {node} { -} - -// Function: precede -template -cudaTask& cudaTask::precede(Ts&&... tasks) { - _precede(std::forward(tasks)...); - return *this; -} - -// Procedure: precede -template -void cudaTask::_precede(T&& other) { - _node->_precede(other._node); -} - -// Procedure: _precede -template -void cudaTask::_precede(T&& task, Ts&&... others) { - _precede(std::forward(task)); - _precede(std::forward(others)...); -} - -// Function: succeed -template -cudaTask& cudaTask::succeed(Ts&&... tasks) { - _succeed(std::forward(tasks)...); - return *this; -} - -// Procedure: succeed -template -void cudaTask::_succeed(T&& other) { - other._node->_precede(_node); -} - -// Procedure: _succeed -template -void cudaTask::_succeed(T&& task, Ts&&... others) { - _succeed(std::forward(task)); - _succeed(std::forward(others)...); -} - -// Function: empty -inline bool cudaTask::empty() const { - return _node == nullptr; -} - -// Function: name -inline cudaTask& cudaTask::name(const std::string& name) { - _node->_name = name; - return *this; -} - -// Function: name -inline const std::string& cudaTask::name() const { - return _node->_name; -} - -// Function: num_successors -inline size_t cudaTask::num_successors() const { - return _node->_successors.size(); -} - -//// Function: kernel -//template -//cudaTask& cudaTask::kernel( -// dim3 grid, dim3 block, size_t shm, F&& func, ArgsT&&... args -//) { -// -// using traits = function_traits; -// -// static_assert(traits::arity == sizeof...(ArgsT), "arity mismatches"); -// -// void* arguments[sizeof...(ArgsT)] = { &args... }; -// -// auto& p = _node->_handle.emplace().param; -// -// p.func = (void*)func; -// p.gridDim = grid; -// p.blockDim = block; -// p.sharedMemBytes = shm; -// p.kernelParams = arguments; -// p.extra = nullptr; -// -// TF_CHECK_CUDA( -// ::cudaGraphAddKernelNode(&_node->_node, _node->_graph._handle, nullptr, 0, &p), -// "failed to create a cudaKernel node" -// ); -// -// return *this; -//} -// -//// Function: copy -//template < -// typename T, -// std::enable_if_t::value, void>* -//> -//cudaTask& cudaTask::copy(T* tgt, T* src, size_t num) { -// -// using U = std::decay_t; -// -// auto& p = _node->_handle.emplace().param; -// -// p.srcArray = nullptr; -// p.srcPos = ::make_cudaPos(0, 0, 0); -// p.srcPtr = ::make_cudaPitchedPtr(src, num*sizeof(U), num, 1); -// p.dstArray = nullptr; -// p.dstPos = ::make_cudaPos(0, 0, 0); -// p.dstPtr = ::make_cudaPitchedPtr(tgt, num*sizeof(U), num, 1); -// p.extent = ::make_cudaExtent(num*sizeof(U), 1, 1); -// p.kind = cudaMemcpyDefault; -// -// TF_CHECK_CUDA( -// cudaGraphAddMemcpyNode(&_node->_node, _node->_graph._handle, nullptr, 0, &p), -// "failed to create a cudaCopy node" -// ); -// -// return *this; -//} - -} // end of namespace tf ----------------------------------------------------- diff --git a/unittests/cuda/basics.cu b/unittests/cuda/basics.cu deleted file mode 100644 index 737c2cdaf..000000000 --- a/unittests/cuda/basics.cu +++ /dev/null @@ -1,306 +0,0 @@ -#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN - -#include -#include - -// ---------------------------------------------------------------------------- -// kernel helper -// ---------------------------------------------------------------------------- -template -__global__ void k_set(T* ptr, size_t N, T value) { - int i = blockIdx.x*blockDim.x + threadIdx.x; - if (i < N) { - ptr[i] = value; - } -} - -template -__global__ void k_single_set(T* ptr, int i, T value) { - ptr[i] = value; -} - -template -__global__ void k_add(T* ptr, size_t N, T value) { - int i = blockIdx.x*blockDim.x + threadIdx.x; - if (i < N) { - ptr[i] += value; - } -} - -template -__global__ void k_single_add(T* ptr, int i, T value) { - ptr[i] += value; -} - -// -------------------------------------------------------- -// standalone execution -// -------------------------------------------------------- -void run(tf::cudaGraph& G) { - cudaGraphExec_t graphExec; - TF_CHECK_CUDA( - cudaGraphInstantiate(&graphExec, G.native_handle(), nullptr, nullptr, 0), - "failed to create an executable cudaGraph" - ); - TF_CHECK_CUDA(cudaGraphLaunch(graphExec, 0), "failed to launch cudaGraph") - TF_CHECK_CUDA(cudaStreamSynchronize(0), "failed to sync cudaStream"); - TF_CHECK_CUDA( - cudaGraphExecDestroy(graphExec), "failed to destroy an executable cudaGraph" - ); -} - -// -------------------------------------------------------- -// Testcase: Builder -// -------------------------------------------------------- -TEST_CASE("Builder" * doctest::timeout(300)) { - - tf::cudaGraph G; - tf::cudaFlow cf(G); - - int source = 1; - int target = 1; - - auto copy1 = cf.copy(&target, &source, 1).name("copy1"); - auto copy2 = cf.copy(&target, &source, 1).name("copy2"); - auto copy3 = cf.copy(&target, &source, 1).name("copy3"); - - REQUIRE(copy1.name() == "copy1"); - REQUIRE(copy2.name() == "copy2"); - REQUIRE(copy3.name() == "copy3"); - - REQUIRE(!copy1.empty()); - REQUIRE(!copy2.empty()); - REQUIRE(!copy3.empty()); - - copy1.precede(copy2); - copy2.succeed(copy3); - - REQUIRE(copy1.num_successors() == 1); - REQUIRE(copy2.num_successors() == 0); - REQUIRE(copy3.num_successors() == 1); -} - -// -------------------------------------------------------- -// Testcase: Set -// -------------------------------------------------------- -template -void set() { - - for(unsigned n=1; n<=1345678; n = n*2 + 1) { - tf::cudaGraph G; - tf::cudaFlow cf(G); - - T* cpu = nullptr; - T* gpu = nullptr; - - cpu = static_cast(std::calloc(n, sizeof(T))); - REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess); - - dim3 g = {(n+255)/256, 1, 1}; - dim3 b = {256, 1, 1}; - auto h2d = cf.copy(gpu, cpu, n); - auto kernel = cf.kernel(g, b, 0, k_set, gpu, n, (T)17); - auto d2h = cf.copy(cpu, gpu, n); - h2d.precede(kernel); - kernel.precede(d2h); - - run(G); - - for(unsigned i=0; i(); -} - -TEST_CASE("Set.i16" * doctest::timeout(300)) { - set(); -} - -TEST_CASE("Set.i32" * doctest::timeout(300)) { - set(); -} - -// -------------------------------------------------------- -// Testcase: Add -// -------------------------------------------------------- -template -void add() { - - for(unsigned n=1; n<=1345678; n = n*2 + 1) { - tf::cudaGraph G; - tf::cudaFlow cf(G); - - T* cpu = nullptr; - T* gpu = nullptr; - - cpu = static_cast(std::calloc(n, sizeof(T))); - REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess); - - dim3 g = {(n+255)/256, 1, 1}; - dim3 b = {256, 1, 1}; - auto h2d = cf.copy(gpu, cpu, n); - auto ad1 = cf.kernel(g, b, 0, k_add, gpu, n, 1); - auto ad2 = cf.kernel(g, b, 0, k_add, gpu, n, 2); - auto ad3 = cf.kernel(g, b, 0, k_add, gpu, n, 3); - auto ad4 = cf.kernel(g, b, 0, k_add, gpu, n, 4); - auto d2h = cf.copy(cpu, gpu, n); - h2d.precede(ad1); - ad1.precede(ad2); - ad2.precede(ad3); - ad3.precede(ad4); - ad4.precede(d2h); - - run(G); - - for(unsigned i=0; i(); -} - -TEST_CASE("Add.i16" * doctest::timeout(300)) { - add(); -} - -TEST_CASE("Add.i32" * doctest::timeout(300)) { - add(); -} - -// TODO: 64-bit fail? -//TEST_CASE("Add.i64" * doctest::timeout(300)) { -// add(); -//} - - -// -------------------------------------------------------- -// Testcase: Binary Set -// -------------------------------------------------------- -template -void bset() { - - const unsigned n = 10000; - - tf::cudaGraph G; - tf::cudaFlow cf(G); - - T* cpu = nullptr; - T* gpu = nullptr; - - cpu = static_cast(std::calloc(n, sizeof(T))); - REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess); - - dim3 g = {1, 1, 1}; - dim3 b = {1, 1, 1}; - auto h2d = cf.copy(gpu, cpu, n); - auto d2h = cf.copy(cpu, gpu, n); - - std::vector tasks(n+1); - - for(unsigned i=1; i<=n; ++i) { - tasks[i] = cf.kernel(g, b, 0, k_single_set, gpu, i-1, (T)17); - - auto p = i/2; - if(p != 0) { - tasks[p].precede(tasks[i]); - } - - tasks[i].precede(d2h); - h2d.precede(tasks[i]); - } - - run(G); - - for(unsigned i=0; i(); -} - -TEST_CASE("BSet.i16" * doctest::timeout(300)) { - bset(); -} - -TEST_CASE("BSet.i32" * doctest::timeout(300)) { - bset(); -} - -// -------------------------------------------------------- -// Testcase: Barrier -// -------------------------------------------------------- -template -void barrier() { - - const unsigned n = 1000; - - tf::cudaGraph G; - tf::cudaFlow cf(G); - - T* cpu = nullptr; - T* gpu = nullptr; - - cpu = static_cast(std::calloc(n, sizeof(T))); - REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess); - - dim3 g = {1, 1, 1}; - dim3 b = {1, 1, 1}; - auto br1 = cf.noop(); - auto br2 = cf.noop(); - auto br3 = cf.noop(); - auto h2d = cf.copy(gpu, cpu, n); - auto d2h = cf.copy(cpu, gpu, n); - - h2d.precede(br1); - - for(unsigned i=0; i, gpu, i, (T)17); - k1.succeed(br1) - .precede(br2); - - auto k2 = cf.kernel(g, b, 0, k_single_add, gpu, i, (T)3); - k2.succeed(br2) - .precede(br3); - } - - br3.precede(d2h); - - run(G); - - for(unsigned i=0; i(); -} - -TEST_CASE("Barrier.i16" * doctest::timeout(300)) { - barrier(); -} - -TEST_CASE("Barrier.i32" * doctest::timeout(300)) { - barrier(); -} -