From 6fddc216cabbf7889db710d9ea764be710a212f4 Mon Sep 17 00:00:00 2001
From: tsung-wei-huang <tsung-wei.huang@utah.edu>
Date: Sat, 29 Feb 2020 17:59:40 -0700
Subject: [PATCH] updated cuda

---
 CMakeLists.txt                 |  34 ++--
 taskflow/core/graph.hpp        |   2 +-
 taskflow/core/taskflow.hpp     |   2 +-
 taskflow/cuda/device.hpp       |  70 --------
 taskflow/cuda/error.hpp        |  31 ----
 taskflow/cuda/flow_builder.hpp | 168 ------------------
 taskflow/cuda/graph.hpp        | 204 ----------------------
 taskflow/cuda/task.hpp         | 222 ------------------------
 unittests/cuda/basics.cu       | 306 ---------------------------------
 9 files changed, 19 insertions(+), 1020 deletions(-)
 delete mode 100644 taskflow/cuda/device.hpp
 delete mode 100644 taskflow/cuda/error.hpp
 delete mode 100644 taskflow/cuda/flow_builder.hpp
 delete mode 100644 taskflow/cuda/graph.hpp
 delete mode 100644 taskflow/cuda/task.hpp
 delete mode 100644 unittests/cuda/basics.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 51f85bdf8..b90ce3e00 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -472,24 +472,24 @@ if(${TF_ENABLE_CUDA})
 
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${TF_UTEST_DIR}/cuda)
 
-add_executable(cuda-basics ${TF_UTEST_DIR}/cuda/basics.cu)
+add_executable(cuda_basics ${TF_UTEST_DIR}/cuda/cuda_basics.cu)
 target_link_libraries(
-  cuda-basics ${PROJECT_NAME} Threads::Threads tf::default_settings
-)
-target_include_directories(cuda-basics PRIVATE ${TF_3RD_PARTY_DIR}/doctest)
-add_test(cuda-basics.builder ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Builder)
-add_test(cuda-basics.set.i8  ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Set.i8)
-add_test(cuda-basics.set.i16 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Set.i16)
-add_test(cuda-basics.set.i32 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Set.i32)
-add_test(cuda-basics.bset.i8  ${TF_UTEST_DIR}/cuda/cuda-basics -tc=BSet.i8)
-add_test(cuda-basics.bset.i16 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=BSet.i16)
-add_test(cuda-basics.bset.i32 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=BSet.i32)
-add_test(cuda-basics.add.i8  ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Add.i8)
-add_test(cuda-basics.add.i16 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Add.i16)
-add_test(cuda-basics.add.i32 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Add.i32)
-add_test(cuda-basics.barrier.i8 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Barrier.i8)
-add_test(cuda-basics.barrier.i16 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Barrier.i16)
-add_test(cuda-basics.barrier.i32 ${TF_UTEST_DIR}/cuda/cuda-basics -tc=Barrier.i32)
+  cuda_basics ${PROJECT_NAME} Threads::Threads tf::default_settings
+)
+target_include_directories(cuda_basics PRIVATE ${TF_3RD_PARTY_DIR}/doctest)
+add_test(cuda_basics.builder ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Builder)
+add_test(cuda_basics.set.i8  ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Set.i8)
+add_test(cuda_basics.set.i16 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Set.i16)
+add_test(cuda_basics.set.i32 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Set.i32)
+add_test(cuda_basics.bset.i8  ${TF_UTEST_DIR}/cuda/cuda_basics -tc=BSet.i8)
+add_test(cuda_basics.bset.i16 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=BSet.i16)
+add_test(cuda_basics.bset.i32 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=BSet.i32)
+add_test(cuda_basics.add.i8  ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Add.i8)
+add_test(cuda_basics.add.i16 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Add.i16)
+add_test(cuda_basics.add.i32 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Add.i32)
+add_test(cuda_basics.barrier.i8 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Barrier.i8)
+add_test(cuda_basics.barrier.i16 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Barrier.i16)
+add_test(cuda_basics.barrier.i32 ${TF_UTEST_DIR}/cuda/cuda_basics -tc=Barrier.i32)
 
 endif(${TF_ENABLE_CUDA})
 
diff --git a/taskflow/core/graph.hpp b/taskflow/core/graph.hpp
index b912eedc3..5296ca6e3 100644
--- a/taskflow/core/graph.hpp
+++ b/taskflow/core/graph.hpp
@@ -9,7 +9,7 @@
 
 #if defined(__CUDA__) || defined(__CUDACC__)
 #define TF_ENABLE_CUDA
-#include "../cuda/flow_builder.hpp"
+#include "../cuda/cuda_flow_builder.hpp"
 #endif
 
 namespace tf {
diff --git a/taskflow/core/taskflow.hpp b/taskflow/core/taskflow.hpp
index 63b95fcd0..89c05dabb 100644
--- a/taskflow/core/taskflow.hpp
+++ b/taskflow/core/taskflow.hpp
@@ -206,7 +206,7 @@ inline void Taskflow::_dump(
 
 #ifdef TF_ENABLE_CUDA
     case Node::CUDAFLOW_WORK:
-      os << "shape=folder fillcolor=gray style=filled";
+      os << "shape=folder fillcolor=cyan style=filled";
     break;
 #endif
 
diff --git a/taskflow/cuda/device.hpp b/taskflow/cuda/device.hpp
deleted file mode 100644
index c763bbe5e..000000000
--- a/taskflow/cuda/device.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-#pragma once
-
-#include "error.hpp"
-
-namespace tf { 
-
-/**
-@brief queries the number of available devices
-*/
-inline unsigned cuda_num_devices() {
-	int N = 0;
-  TF_CHECK_CUDA(cudaGetDeviceCount(&N), "failed to get device count");
-	return N;
-}
-
-/**
-@brief gets the current device associated with the caller thread
-*/
-inline int cuda_get_device() {
-  int id;
-  TF_CHECK_CUDA(cudaGetDevice(&id), "failed to get current device id");
-	return id;
-}
-
-/**
-@brief switches to a given device context
-*/
-inline void cuda_set_device(int id) {
-  TF_CHECK_CUDA(cudaSetDevice(id), "failed to switch to device ", id);
-}
-
-/** @class cudaScopedDevice
-
-@brief RAII-style device context switch
-
-*/
-class cudaScopedDevice {
-
-  public:
-    
-    cudaScopedDevice(int);
-    ~cudaScopedDevice();
-
-  private:
-
-    int _p;
-};
-
-// Constructor
-inline cudaScopedDevice::cudaScopedDevice(int dev) { 
-  TF_CHECK_CUDA(cudaGetDevice(&_p), "failed to get current device scope");
-  if(_p == dev) {
-    _p = -1;
-  }
-  else {
-    TF_CHECK_CUDA(cudaSetDevice(dev), "failed to scope on device ", dev);
-  }
-}
-
-// Destructor
-inline cudaScopedDevice::~cudaScopedDevice() { 
-  if(_p != -1) {
-    cudaSetDevice(_p);
-    //TF_CHECK_CUDA(cudaSetDevice(_p), "failed to scope back to device ", _p);
-  }
-}
-
-}  // end of namespace cuda ---------------------------------------------------
-
-
diff --git a/taskflow/cuda/error.hpp b/taskflow/cuda/error.hpp
deleted file mode 100644
index a2b679571..000000000
--- a/taskflow/cuda/error.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-#pragma once
-
-#include <cuda.h>
-#include <iostream>
-#include <sstream>
-#include <exception>
-
-#include "../utility/stringify.hpp"
-
-#define TF_CUDA_REMOVE_FIRST_HELPER(N, ...) __VA_ARGS__
-#define TF_CUDA_REMOVE_FIRST(...) TF_CUDA_REMOVE_FIRST_HELPER(__VA_ARGS__)
-#define TF_CUDA_GET_FIRST_HELPER(N, ...) N
-#define TF_CUDA_GET_FIRST(...) TF_CUDA_GET_FIRST_HELPER(__VA_ARGS__)
-
-#define TF_CHECK_CUDA(...)                                       \
-if(TF_CUDA_GET_FIRST(__VA_ARGS__) != cudaSuccess) {              \
-  std::ostringstream oss;                                        \
-  auto ev = TF_CUDA_GET_FIRST(__VA_ARGS__);                      \
-  auto unknown_str  = "unknown error";                           \
-  auto unknown_name = "cudaErrorUnknown";                        \
-  auto error_str  = ::cudaGetErrorString(ev);                    \
-  auto error_name = ::cudaGetErrorName(ev);                      \
-  oss << "[" << __FILE__ << ":" << __LINE__ << "] "              \
-      << (error_str  ? error_str  : unknown_str)                 \
-      << " ("                                                    \
-      << (error_name ? error_name : unknown_name)                \
-      << ") - ";                                                 \
-  tf::ostreamize(oss, TF_CUDA_REMOVE_FIRST(__VA_ARGS__));  \
-  throw std::runtime_error(oss.str());                           \
-}
-
diff --git a/taskflow/cuda/flow_builder.hpp b/taskflow/cuda/flow_builder.hpp
deleted file mode 100644
index 640532c24..000000000
--- a/taskflow/cuda/flow_builder.hpp
+++ /dev/null
@@ -1,168 +0,0 @@
-#pragma once
-
-#include "task.hpp"
-
-namespace tf {
-
-/**
-@class cudaFlow
-
-@brief Building methods of a cuda task dependency graph.
-*/
-class cudaFlow {
-
-  public:
-    
-    /**
-    @brief constructs a cudaFlow builder object
-
-    @param graph a cudaGraph to manipulate
-    */
-    cudaFlow(cudaGraph& graph);
-
-    /**
-    @brief queries the emptiness of the graph
-    */
-    bool empty() const;
-    
-    /**
-    @brief creates a placeholder task
-    */
-    cudaTask placeholder();
-
-    /**
-    @brief creates a no-operation task
-
-    An empty node performs no operation during execution, 
-    but can be used for transitive ordering. 
-    For example, a phased execution graph with 2 groups of n nodes 
-    with a barrier between them can be represented using an empty node 
-    and 2*n dependency edges, 
-    rather than no empty node and n^2 dependency edges.
-    */
-    cudaTask noop();
-    
-    /**
-    @brief creates a kernel task
-    
-    @tparam F kernel function type
-    @tparam ArgsT kernel function parameters type
-
-    @param g configured grid
-    @param b configured block
-    @param s configured shared memory
-    @param f kernel function
-    @param args arguments to forward to the kernel function by copy
-
-    @return cudaTask handle
-    */
-    template <typename F, typename... ArgsT>
-    cudaTask kernel(dim3 g, dim3 b, size_t s, F&& f, ArgsT&&... args);
-    
-    /**
-    @brief creates an 1D copy task
-    
-    @tparam T element type (non-void)
-
-    @param tgt pointer to the target memory block
-    @param src pointer to the source memory block
-    @param num number of elements to copy
-
-    @return cudaTask handle
-
-    A copy task transfers num*sizeof(T) bytes of data from a source location
-    to a target location. Direction can be either cpu-to-gpu, gpu-to-cpu,
-    gpu-to-gpu, or gpu-to-cpu.
-    */
-    template <
-      typename T, 
-      std::enable_if_t<!std::is_same<T, void>::value, void>* = nullptr
-    >
-    cudaTask copy(T* tgt, T* src, size_t num);
-
-  private:
-
-    cudaGraph& _graph;
-};
-
-// Constructor
-inline cudaFlow::cudaFlow(cudaGraph& g) : _graph {g} {
-}
-
-// Function: empty
-inline bool cudaFlow::empty() const {
-  return _graph._nodes.empty();
-}
-
-// Function: noop
-inline cudaTask cudaFlow::noop() {
-  auto node = _graph.emplace_back();
-  node->_handle.emplace<cudaNode::Noop>();
-  TF_CHECK_CUDA(
-    ::cudaGraphAddEmptyNode(&node->_node, _graph._handle, nullptr, 0),
-    "failed to create a no-operation (empty) node"
-  );
-  return cudaTask(node);
-}
-
-// Function: kernel
-template <typename F, typename... ArgsT>
-cudaTask cudaFlow::kernel(
-  dim3 grid, dim3 block, size_t shm, F&& func, ArgsT&&... args
-) {
-  
-  using traits = function_traits<F>;
-
-  static_assert(traits::arity == sizeof...(ArgsT), "arity mismatches");
-
-  void* arguments[sizeof...(ArgsT)] = { (void*)(&args)... };
-
-  auto node = _graph.emplace_back();
-
-  auto& p = node->_handle.emplace<cudaNode::Kernel>().param;
-
-  p.func = (void*)func;
-  p.gridDim = grid;
-  p.blockDim = block;
-  p.sharedMemBytes = shm;
-  p.kernelParams = arguments;
-  p.extra = nullptr;
-  
-  TF_CHECK_CUDA(
-    ::cudaGraphAddKernelNode(&node->_node, _graph._handle, nullptr, 0, &p),
-    "failed to create a cudaKernel node"
-  );
-  
-  return cudaTask(node);
-}
-
-// Function: copy
-template <
-  typename T,
-  std::enable_if_t<!std::is_same<T, void>::value, void>*
->
-cudaTask cudaFlow::copy(T* tgt, T* src, size_t num) {
-
-  using U = std::decay_t<T>;
-
-  auto node = _graph.emplace_back();
-  auto& p = node->_handle.emplace<cudaNode::Copy>().param;
-
-  p.srcArray = nullptr;
-  p.srcPos = ::make_cudaPos(0, 0, 0);
-  p.srcPtr = ::make_cudaPitchedPtr(src, num*sizeof(U), num, 1);
-  p.dstArray = nullptr;
-  p.dstPos = ::make_cudaPos(0, 0, 0);
-  p.dstPtr = ::make_cudaPitchedPtr(tgt, num*sizeof(U), num, 1);
-  p.extent = ::make_cudaExtent(num*sizeof(U), 1, 1);
-  p.kind = cudaMemcpyDefault;
-
-  TF_CHECK_CUDA(
-    cudaGraphAddMemcpyNode(&node->_node, _graph._handle, nullptr, 0, &p),
-    "failed to create a cudaCopy node"
-  );
-
-  return cudaTask(node);
-}
-
-}  // end of namespace tf -----------------------------------------------------
diff --git a/taskflow/cuda/graph.hpp b/taskflow/cuda/graph.hpp
deleted file mode 100644
index f6dda2590..000000000
--- a/taskflow/cuda/graph.hpp
+++ /dev/null
@@ -1,204 +0,0 @@
-#pragma once
-
-#include "device.hpp"
-
-#include "../utility/object_pool.hpp"
-#include "../utility/traits.hpp"
-#include "../utility/passive_vector.hpp"
-#include "../nstd/variant.hpp"
-
-namespace tf {
-
-// ----------------------------------------------------------------------------
-// cudaNode class
-// ----------------------------------------------------------------------------
-
-// class: cudaNode
-class cudaNode {
-  
-  friend class cudaFlow;
-  friend class cudaGraph;
-  friend class cudaTask;
-
-  friend class Taskflow;
-  friend class Executor;
-
-  // Host handle
-  //struct Host {
-  //  cudaHostNodeParams param;
-  //};
-
-  struct Noop {
-
-  };
-
-  // Copy handle
-  struct Copy {
-    
-    template <typename... ArgsT>
-    Copy(ArgsT&&...);
-
-    cudaMemcpy3DParms param;
-  };
-  
-  // Kernel handle
-  struct Kernel {
-    
-    template <typename... ArgsT>
-    Kernel(ArgsT&&...);
-
-    cudaKernelNodeParams param;
-  };
-
-  using handle_t = nstd::variant<nstd::monostate, Noop, Copy, Kernel>;
-  
-  // variant index
-  constexpr static auto NOOP = get_index_v<Noop, handle_t>;
-  constexpr static auto COPY = get_index_v<Copy, handle_t>; 
-  constexpr static auto KERNEL = get_index_v<Kernel, handle_t>;
-
-  public:
-  
-    cudaNode(cudaGraph&);
-
-  private:
-
-    cudaGraph& _graph;
-
-    std::string _name;
-    
-    handle_t _handle;
-
-    cudaGraphNode_t _node {nullptr};
-
-    PassiveVector<cudaNode*> _successors;
-
-    void _precede(cudaNode*);
-};
-
-// ----------------------------------------------------------------------------
-// cudaGraph class
-// ----------------------------------------------------------------------------
-
-// class: cudaGraph
-class cudaGraph {
-
-  friend class cudaFlow;
-  friend class cudaNode;
-  friend class cudaTask;
-  
-  friend class Taskflow;
-  friend class Executor;
-
-  public:
-
-    cudaGraph();
-    ~cudaGraph();
-
-    template <typename... ArgsT>
-    cudaNode* emplace_back(ArgsT&&...);
-
-    cudaGraph_t native_handle();
-
-    void clear();
-
-    bool empty() const;
-
-  private:
-    
-    cudaGraph_t _handle {nullptr};
-
-    std::vector<std::unique_ptr<cudaNode>> _nodes;
-};
-
-// ----------------------------------------------------------------------------
-// cudaNode definitions
-// ----------------------------------------------------------------------------
-
-// Copy handle constructor
-template <typename... ArgsT>
-cudaNode::Copy::Copy(ArgsT&&... args) {
-}
-
-// Kernel handle constructor
-template <typename... ArgsT>
-cudaNode::Kernel::Kernel(ArgsT&&... args) {
-}
-
-// Constructor
-inline cudaNode::cudaNode(cudaGraph& g) : _graph {g} {
-}
-
-// Procedure: _precede
-inline void cudaNode::_precede(cudaNode* v) {
-  _successors.push_back(v);
-  TF_CHECK_CUDA(
-    ::cudaGraphAddDependencies(_graph._handle, &_node, &(v->_node), 1),
-    "failed to add a preceding link"
-  );
-}
-
-// ----------------------------------------------------------------------------
-// cudaGraph definitions
-// ----------------------------------------------------------------------------
-
-// Constructor
-inline cudaGraph::cudaGraph() {
-  TF_CHECK_CUDA(cudaGraphCreate(&_handle, 0), "failed to create a cudaGraph");
-}
-
-// Destructor
-inline cudaGraph::~cudaGraph() {
-  cudaGraphDestroy(_handle);
-}
-
-// Function: empty
-inline bool cudaGraph::empty() const {
-  return _nodes.empty();
-}
-
-// Procedure: clear
-inline void cudaGraph::clear() {
-
-  _nodes.clear();
-
-  cudaGraphDestroy(_handle);
-  TF_CHECK_CUDA(
-    cudaGraphCreate(&_handle, 0), 
-    "failed to create a cudaGraph after clear"
-  );
-}
-    
-// Function: emplace_back
-template <typename... ArgsT>
-cudaNode* cudaGraph::emplace_back(ArgsT&&... args) {
-  auto node = std::make_unique<cudaNode>(*this, std::forward<ArgsT>(args)...);
-  _nodes.emplace_back(std::move(node));
-  return _nodes.back().get();
-}
-
-// Function: native_handle
-inline cudaGraph_t cudaGraph::native_handle() {
-  return _handle;
-}
-
-
-//inline void cudaGraph::run() {
-//  cudaGraphExec_t graphExec;
-//  TF_CHECK_CUDA(
-//    cudaGraphInstantiate(&graphExec, _handle, nullptr, nullptr, 0),
-//    "failed to create an executable cudaGraph"
-//  );
-//  TF_CHECK_CUDA(cudaGraphLaunch(graphExec, 0), "failed to launch cudaGraph")
-//  TF_CHECK_CUDA(cudaStreamSynchronize(0), "failed to sync cudaStream");
-//  TF_CHECK_CUDA(
-//    cudaGraphExecDestroy(graphExec), "failed to destroy an executable cudaGraph"
-//  );
-//}
-
-
-
-
-
-}  // end of namespace tf -----------------------------------------------------
-
diff --git a/taskflow/cuda/task.hpp b/taskflow/cuda/task.hpp
deleted file mode 100644
index ae0bf1cc6..000000000
--- a/taskflow/cuda/task.hpp
+++ /dev/null
@@ -1,222 +0,0 @@
-#pragma once
-
-#include "graph.hpp"
-
-namespace tf {
-
-/**
-@class Task
-
-@brief task handle to a node in a cudaGraph
-*/
-class cudaTask {
-
-  friend class cudaFlow;
-
-  public:
-    
-    /**
-    @brief constructs an empty cudaTask
-    */
-    cudaTask() = default;
-
-    /**
-    @brief copy-constructs a cudaTask
-    */
-    cudaTask(const cudaTask&) = default;
-
-    /**
-    @brief copy-assigns a cudaTask
-    */
-    cudaTask& operator = (const cudaTask&) = default;
-
-    /**
-    @brief adds precedence links from this to other tasks
-
-    @tparam Ts... parameter pack
-
-    @param tasks one or multiple tasks
-
-    @return @c *this
-    */
-    template <typename... Ts>
-    cudaTask& precede(Ts&&... tasks);
-    
-    /**
-    @brief adds precedence links from other tasks to this
-
-    @tparam Ts... parameter pack
-
-    @param tasks one or multiple tasks
-
-    @return @c *this
-    */
-    template <typename... Ts>
-    cudaTask& succeed(Ts&&... tasks);
-    
-    /**
-    @brief assigns a name to the task
-
-    @param name a @std_string acceptable string
-
-    @return @c *this
-    */
-    cudaTask& name(const std::string& name);
-    
-    /**
-    @brief queries the name of the task
-    */
-    const std::string& name() const;
-
-    /**
-    @brief queries the number of successors
-    */
-    size_t num_successors() const;
-
-    /**
-    @brief queries if the task is associated with a cudaNode
-    */
-    bool empty() const;
-
-  private:
-    
-    cudaTask(cudaNode*);
-
-    cudaNode* _node {nullptr};
-
-    template <typename T>
-    void _precede(T&&);
-
-    template <typename T, typename... Ts>
-    void _precede(T&&, Ts&&...);
-    
-    template <typename T>
-    void _succeed(T&&);
-
-    template <typename T, typename... Ts>
-    void _succeed(T&&, Ts&&...);
-};
-
-// Constructor
-inline cudaTask::cudaTask(cudaNode* node) : _node {node} {
-}
-
-// Function: precede
-template <typename... Ts>
-cudaTask& cudaTask::precede(Ts&&... tasks) {
-  _precede(std::forward<Ts>(tasks)...);
-  return *this;
-}
-
-// Procedure: precede
-template <typename T>
-void cudaTask::_precede(T&& other) {
-  _node->_precede(other._node);
-}
-
-// Procedure: _precede
-template <typename T, typename... Ts>
-void cudaTask::_precede(T&& task, Ts&&... others) {
-  _precede(std::forward<T>(task));
-  _precede(std::forward<Ts>(others)...);
-}
-
-// Function: succeed
-template <typename... Ts>
-cudaTask& cudaTask::succeed(Ts&&... tasks) {
-  _succeed(std::forward<Ts>(tasks)...);
-  return *this;
-}
-
-// Procedure: succeed
-template <typename T>
-void cudaTask::_succeed(T&& other) {
-  other._node->_precede(_node);
-}
-
-// Procedure: _succeed
-template <typename T, typename... Ts>
-void cudaTask::_succeed(T&& task, Ts&&... others) {
-  _succeed(std::forward<T>(task));
-  _succeed(std::forward<Ts>(others)...);
-}
-
-// Function: empty
-inline bool cudaTask::empty() const {
-  return _node == nullptr;
-}
-
-// Function: name
-inline cudaTask& cudaTask::name(const std::string& name) {
-  _node->_name = name;
-  return *this;
-}
-
-// Function: name
-inline const std::string& cudaTask::name() const {
-  return _node->_name;
-}
-
-// Function: num_successors
-inline size_t cudaTask::num_successors() const {
-  return _node->_successors.size();
-}
-
-//// Function: kernel
-//template <typename F, typename... ArgsT>
-//cudaTask& cudaTask::kernel(
-//  dim3 grid, dim3 block, size_t shm, F&& func, ArgsT&&... args
-//) {
-//
-//  using traits = function_traits<F>;
-//
-//  static_assert(traits::arity == sizeof...(ArgsT), "arity mismatches");
-//
-//  void* arguments[sizeof...(ArgsT)] = { &args... };
-//
-//  auto& p = _node->_handle.emplace<cudaNode::Kernel>().param;
-//
-//  p.func = (void*)func;
-//  p.gridDim = grid;
-//  p.blockDim = block;
-//  p.sharedMemBytes = shm;
-//  p.kernelParams = arguments;
-//  p.extra = nullptr;
-//  
-//  TF_CHECK_CUDA(
-//    ::cudaGraphAddKernelNode(&_node->_node, _node->_graph._handle, nullptr, 0, &p),
-//    "failed to create a cudaKernel node"
-//  );
-//  
-//  return *this;
-//}
-//
-//// Function: copy
-//template <
-//  typename T,
-//  std::enable_if_t<!std::is_same<T, void>::value, void>*
-//>
-//cudaTask& cudaTask::copy(T* tgt, T* src, size_t num) {
-//
-//  using U = std::decay_t<T>;
-//
-//  auto& p = _node->_handle.emplace<cudaNode::Copy>().param;
-//
-//  p.srcArray = nullptr;
-//  p.srcPos = ::make_cudaPos(0, 0, 0);
-//  p.srcPtr = ::make_cudaPitchedPtr(src, num*sizeof(U), num, 1);
-//  p.dstArray = nullptr;
-//  p.dstPos = ::make_cudaPos(0, 0, 0);
-//  p.dstPtr = ::make_cudaPitchedPtr(tgt, num*sizeof(U), num, 1);
-//  p.extent = ::make_cudaExtent(num*sizeof(U), 1, 1);
-//  p.kind = cudaMemcpyDefault;
-//
-//  TF_CHECK_CUDA(
-//    cudaGraphAddMemcpyNode(&_node->_node, _node->_graph._handle, nullptr, 0, &p),
-//    "failed to create a cudaCopy node"
-//  );
-//
-//  return *this;
-//}
-
-}  // end of namespace tf -----------------------------------------------------
diff --git a/unittests/cuda/basics.cu b/unittests/cuda/basics.cu
deleted file mode 100644
index 737c2cdaf..000000000
--- a/unittests/cuda/basics.cu
+++ /dev/null
@@ -1,306 +0,0 @@
-#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
-
-#include <doctest.h>
-#include <taskflow/taskflow.hpp>
-
-// ----------------------------------------------------------------------------
-// kernel helper
-// ----------------------------------------------------------------------------
-template <typename T>
-__global__ void k_set(T* ptr, size_t N, T value) {
-  int i = blockIdx.x*blockDim.x + threadIdx.x;
-  if (i < N) {
-    ptr[i] = value;
-  }
-}
-
-template <typename T>
-__global__ void k_single_set(T* ptr, int i, T value) {
-  ptr[i] = value;
-}
-
-template <typename T>
-__global__ void k_add(T* ptr, size_t N, T value) {
-  int i = blockIdx.x*blockDim.x + threadIdx.x;
-  if (i < N) {
-    ptr[i] += value;
-  }
-}
-
-template <typename T>
-__global__ void k_single_add(T* ptr, int i, T value) {
-  ptr[i] += value;
-}
-
-// --------------------------------------------------------
-// standalone execution
-// --------------------------------------------------------
-void run(tf::cudaGraph& G) {
-  cudaGraphExec_t graphExec;
-  TF_CHECK_CUDA(
-    cudaGraphInstantiate(&graphExec, G.native_handle(), nullptr, nullptr, 0),
-    "failed to create an executable cudaGraph"
-  );
-  TF_CHECK_CUDA(cudaGraphLaunch(graphExec, 0), "failed to launch cudaGraph")
-  TF_CHECK_CUDA(cudaStreamSynchronize(0), "failed to sync cudaStream");
-  TF_CHECK_CUDA(
-    cudaGraphExecDestroy(graphExec), "failed to destroy an executable cudaGraph"
-  );
-}
-
-// --------------------------------------------------------
-// Testcase: Builder
-// --------------------------------------------------------
-TEST_CASE("Builder" * doctest::timeout(300)) {
-
-  tf::cudaGraph G;
-  tf::cudaFlow cf(G);
-
-  int source = 1;
-  int target = 1;
-
-  auto copy1 = cf.copy(&target, &source, 1).name("copy1");
-  auto copy2 = cf.copy(&target, &source, 1).name("copy2");
-  auto copy3 = cf.copy(&target, &source, 1).name("copy3");
-
-  REQUIRE(copy1.name() == "copy1");
-  REQUIRE(copy2.name() == "copy2");
-  REQUIRE(copy3.name() == "copy3");
-
-  REQUIRE(!copy1.empty());
-  REQUIRE(!copy2.empty());
-  REQUIRE(!copy3.empty());
-  
-  copy1.precede(copy2);
-  copy2.succeed(copy3);
-
-  REQUIRE(copy1.num_successors() == 1);
-  REQUIRE(copy2.num_successors() == 0);
-  REQUIRE(copy3.num_successors() == 1);
-}
-
-// --------------------------------------------------------
-// Testcase: Set
-// --------------------------------------------------------
-template <typename T>
-void set() {
-
-  for(unsigned n=1; n<=1345678; n = n*2 + 1) {
-    tf::cudaGraph G;
-    tf::cudaFlow cf(G);
-    
-    T* cpu = nullptr;
-    T* gpu = nullptr;
-
-    cpu = static_cast<T*>(std::calloc(n, sizeof(T)));
-    REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
-
-    dim3 g = {(n+255)/256, 1, 1};
-    dim3 b = {256, 1, 1};
-    auto h2d = cf.copy(gpu, cpu, n);
-    auto kernel = cf.kernel(g, b, 0, k_set<T>, gpu, n, (T)17);
-    auto d2h = cf.copy(cpu, gpu, n);
-    h2d.precede(kernel);
-    kernel.precede(d2h);
-    
-    run(G);
-
-    for(unsigned i=0; i<n; ++i) {
-      REQUIRE(cpu[i] == (T)17);
-    }
-
-    std::free(cpu);
-    REQUIRE(cudaFree(gpu) == cudaSuccess);
-  }
-}
-
-TEST_CASE("Set.i8" * doctest::timeout(300)) {
-  set<int8_t>();
-}
-
-TEST_CASE("Set.i16" * doctest::timeout(300)) {
-  set<int16_t>();
-}
-
-TEST_CASE("Set.i32" * doctest::timeout(300)) {
-  set<int32_t>();
-}
-
-// --------------------------------------------------------
-// Testcase: Add
-// --------------------------------------------------------
-template <typename T>
-void add() {
-
-  for(unsigned n=1; n<=1345678; n = n*2 + 1) {
-    tf::cudaGraph G;
-    tf::cudaFlow cf(G);
-    
-    T* cpu = nullptr;
-    T* gpu = nullptr;
-
-    cpu = static_cast<T*>(std::calloc(n, sizeof(T)));
-    REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
-
-    dim3 g = {(n+255)/256, 1, 1};
-    dim3 b = {256, 1, 1};
-    auto h2d = cf.copy(gpu, cpu, n);
-    auto ad1 = cf.kernel(g, b, 0, k_add<T>, gpu, n, 1);
-    auto ad2 = cf.kernel(g, b, 0, k_add<T>, gpu, n, 2);
-    auto ad3 = cf.kernel(g, b, 0, k_add<T>, gpu, n, 3);
-    auto ad4 = cf.kernel(g, b, 0, k_add<T>, gpu, n, 4);
-    auto d2h = cf.copy(cpu, gpu, n);
-    h2d.precede(ad1);
-    ad1.precede(ad2);
-    ad2.precede(ad3);
-    ad3.precede(ad4);
-    ad4.precede(d2h);
-    
-    run(G);
-
-    for(unsigned i=0; i<n; ++i) {
-      REQUIRE(cpu[i] == 10);
-    }
-
-    std::free(cpu);
-    REQUIRE(cudaFree(gpu) == cudaSuccess);
-  }
-}
-
-TEST_CASE("Add.i8" * doctest::timeout(300)) {
-  add<int8_t>();
-}
-
-TEST_CASE("Add.i16" * doctest::timeout(300)) {
-  add<int16_t>();
-}
-
-TEST_CASE("Add.i32" * doctest::timeout(300)) {
-  add<int32_t>();
-}
-
-// TODO: 64-bit fail?
-//TEST_CASE("Add.i64" * doctest::timeout(300)) {
-//  add<int64_t>();
-//}
-
-
-// --------------------------------------------------------
-// Testcase: Binary Set
-// --------------------------------------------------------
-template <typename T>
-void bset() {
-
-  const unsigned n = 10000;
-
-  tf::cudaGraph G;
-  tf::cudaFlow cf(G);
-  
-  T* cpu = nullptr;
-  T* gpu = nullptr;
-
-  cpu = static_cast<T*>(std::calloc(n, sizeof(T)));
-  REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
-
-  dim3 g = {1, 1, 1};
-  dim3 b = {1, 1, 1};
-  auto h2d = cf.copy(gpu, cpu, n);
-  auto d2h = cf.copy(cpu, gpu, n);
-
-  std::vector<tf::cudaTask> tasks(n+1);
-
-  for(unsigned i=1; i<=n; ++i) {
-    tasks[i] = cf.kernel(g, b, 0, k_single_set<T>, gpu, i-1, (T)17);
-
-    auto p = i/2;
-    if(p != 0) {
-      tasks[p].precede(tasks[i]);
-    }
-
-    tasks[i].precede(d2h);
-    h2d.precede(tasks[i]);
-  }
-  
-  run(G);
-
-  for(unsigned i=0; i<n; ++i) {
-    REQUIRE(cpu[i] == (T)17);
-  }
-
-  std::free(cpu);
-  REQUIRE(cudaFree(gpu) == cudaSuccess);
-}
-
-TEST_CASE("BSet.i8" * doctest::timeout(300)) {
-  bset<int8_t>();
-}
-
-TEST_CASE("BSet.i16" * doctest::timeout(300)) {
-  bset<int16_t>();
-}
-
-TEST_CASE("BSet.i32" * doctest::timeout(300)) {
-  bset<int32_t>();
-}
-
-// --------------------------------------------------------
-// Testcase: Barrier
-// --------------------------------------------------------
-template <typename T>
-void barrier() {
-
-  const unsigned n = 1000;
-
-  tf::cudaGraph G;
-  tf::cudaFlow cf(G);
-  
-  T* cpu = nullptr;
-  T* gpu = nullptr;
-
-  cpu = static_cast<T*>(std::calloc(n, sizeof(T)));
-  REQUIRE(cudaMalloc(&gpu, n*sizeof(T)) == cudaSuccess);
-
-  dim3 g = {1, 1, 1};
-  dim3 b = {1, 1, 1};
-  auto br1 = cf.noop();
-  auto br2 = cf.noop();
-  auto br3 = cf.noop();
-  auto h2d = cf.copy(gpu, cpu, n);
-  auto d2h = cf.copy(cpu, gpu, n);
-
-  h2d.precede(br1);
-
-  for(unsigned i=0; i<n; ++i) {
-    auto k1 = cf.kernel(g, b, 0, k_single_set<T>, gpu, i, (T)17);
-    k1.succeed(br1)
-      .precede(br2);
-
-    auto k2 = cf.kernel(g, b, 0, k_single_add<T>, gpu, i, (T)3);
-    k2.succeed(br2)
-      .precede(br3);
-  }
-
-  br3.precede(d2h);
-  
-  run(G);
-
-  for(unsigned i=0; i<n; ++i) {
-    REQUIRE(cpu[i] == (T)20);
-  }
-
-  std::free(cpu);
-  REQUIRE(cudaFree(gpu) == cudaSuccess);
-}
-
-TEST_CASE("Barrier.i8" * doctest::timeout(300)) {
-  barrier<int8_t>();
-}
-
-TEST_CASE("Barrier.i16" * doctest::timeout(300)) {
-  barrier<int16_t>();
-}
-
-TEST_CASE("Barrier.i32" * doctest::timeout(300)) {
-  barrier<int32_t>();
-}
-