Bring back MXNET_GPU_COPY_NTHREADS env variable (apache#11237)
* bring back MXNET_GPU_COPY_NTHREADS env variable

* make variable size_t for consistency
ctcyang authored and szha committed Jun 18, 2018
1 parent 92fde19 commit 0ac9f8b
Showing 2 changed files with 6 additions and 2 deletions.
2 changes: 1 addition & 1 deletion docs/faq/env_var.md
@@ -14,7 +14,7 @@ export MXNET_GPU_WORKER_NTHREADS=3
   - Values: Int ```(default=2)```
   - The maximum number of threads to use on each GPU. This parameter is used to parallelize the computation within a single GPU card.
 * MXNET_GPU_COPY_NTHREADS
-  - Values: Int ```(default=1)```
+  - Values: Int ```(default=2)```
   - The maximum number of concurrent threads that do the memory copy job on each GPU.
 * MXNET_CPU_WORKER_NTHREADS
   - Values: Int ```(default=1)```
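
For context (not part of this commit): a minimal, self-contained sketch of how an integer environment variable with a default is read. The helper name `GetEnvOrDefault` is hypothetical and simply mirrors what `dmlc::GetEnv` does in the engine change below.

```cpp
#include <cstdlib>
#include <iostream>
#include <string>

// Hypothetical stand-in for dmlc::GetEnv: return the integer value of an
// environment variable, or a default when the variable is unset.
static size_t GetEnvOrDefault(const char* name, size_t default_value) {
  const char* value = std::getenv(name);
  return value != nullptr ? static_cast<size_t>(std::stoul(value)) : default_value;
}

int main() {
  // With this commit, the copy-thread count defaults to 2 and can be
  // overridden before launch, e.g. `export MXNET_GPU_COPY_NTHREADS=4`.
  const size_t copy_threads = GetEnvOrDefault("MXNET_GPU_COPY_NTHREADS", 2);
  std::cout << "GPU copy threads: " << copy_threads << std::endl;
  return 0;
}
```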
6 changes: 5 additions & 1 deletion src/engine/threaded_engine_perdevice.cc
@@ -76,6 +76,7 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
     if (is_worker_) return;
     gpu_worker_nthreads_ = common::GetNumThreadsPerGPU();
     cpu_worker_nthreads_ = dmlc::GetEnv("MXNET_CPU_WORKER_NTHREADS", 1);
+    gpu_copy_nthreads_ = dmlc::GetEnv("MXNET_GPU_COPY_NTHREADS", 2);
     // create CPU task
     int cpu_priority_nthreads = dmlc::GetEnv("MXNET_CPU_PRIORITY_NTHREADS", 4);
     cpu_priority_worker_.reset(new ThreadWorkerBlock<kPriorityQueue>());
@@ -128,8 +129,8 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
       const FnProperty prop = opr_block->opr->prop;
       const bool is_copy = (prop == FnProperty::kCopyFromGPU ||
                             prop == FnProperty::kCopyToGPU);
-      const size_t nthread = gpu_worker_nthreads_;
       if (is_copy) {
+        const size_t nthread = gpu_copy_nthreads_;
         auto ptr = gpu_copy_workers_.Get(ctx.dev_id, [this, ctx, is_copy, nthread]() {
           // Signify to kernel that GPU is being used, so reserve cores as necessary
           OpenMP::Get()->set_reserve_cores(GetReserveCoreCount(true));
@@ -150,6 +151,7 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
           }
         }
       } else {
+        const size_t nthread = gpu_worker_nthreads_;
         auto ptr = gpu_normal_workers_.Get(ctx.dev_id, [this, ctx, is_copy, nthread]() {
           // Signify to kernel that GPU is being used, so reserve cores as necessary
           OpenMP::Get()->set_reserve_cores(GetReserveCoreCount(true));
@@ -194,6 +196,8 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
   size_t cpu_worker_nthreads_;
   /*! \brief number of concurrent thread each gpu worker uses */
   size_t gpu_worker_nthreads_;
+  /*! \brief number of concurrent thread each gpu copy worker uses */
+  size_t gpu_copy_nthreads_;
   // cpu worker
   common::LazyAllocArray<ThreadWorkerBlock<kWorkerQueue> > cpu_normal_workers_;
   // cpu priority worker
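
To illustrate the effect of the change, here is a simplified, hedged sketch (the type and member names below are stand-ins, not the actual engine code) of how the per-device engine now picks a thread count for an operation, using the `gpu_copy_nthreads_` member restored by this commit:

```cpp
#include <cstddef>
#include <iostream>

// Simplified stand-ins for the engine's types; the real logic lives in
// src/engine/threaded_engine_perdevice.cc.
enum class FnProperty { kNormal, kCopyFromGPU, kCopyToGPU };

struct PerDeviceEngineSketch {
  size_t gpu_worker_nthreads_ = 2;  // MXNET_GPU_WORKER_NTHREADS
  size_t gpu_copy_nthreads_ = 2;    // MXNET_GPU_COPY_NTHREADS (restored here)

  // Copy operations get their own thread count; everything else uses the
  // normal GPU worker count, matching the branch shown in the diff above.
  size_t ThreadCountFor(FnProperty prop) const {
    const bool is_copy = (prop == FnProperty::kCopyFromGPU ||
                          prop == FnProperty::kCopyToGPU);
    return is_copy ? gpu_copy_nthreads_ : gpu_worker_nthreads_;
  }
};

int main() {
  PerDeviceEngineSketch engine;
  std::cout << engine.ThreadCountFor(FnProperty::kCopyToGPU) << std::endl;  // copy pool size
  std::cout << engine.ThreadCountFor(FnProperty::kNormal) << std::endl;     // worker pool size
  return 0;
}
```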