Skip to content

Commit 053ecd6

Browse files
committed
Refine CPU version for ParallelExecutor
1 parent efaed58 commit 053ecd6

File tree

6 files changed

+28
-36
lines changed

6 files changed

+28
-36
lines changed

paddle/fluid/framework/details/all_reduce_op_handle.cc

Lines changed: 6 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -27,20 +27,16 @@ AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
2727
const std::vector<platform::Place> &places,
2828
const platform::NCCLContextMap *ctxs)
2929
: local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {
30-
use_cuda_ = false;
3130
if (nccl_ctxs_) {
3231
for (auto &p : places_) {
3332
this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p);
3433
}
35-
use_cuda_ = true;
3634
}
3735
}
3836
#else
3937
AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
4038
const std::vector<platform::Place> &places)
41-
: local_scopes_(local_scopes), places_(places) {
42-
use_cuda_ = false;
43-
}
39+
: local_scopes_(local_scopes), places_(places) {}
4440
#endif
4541

4642
void AllReduceOpHandle::RunImpl() {
@@ -117,28 +113,18 @@ void AllReduceOpHandle::RunImpl() {
117113
// Reduce All Tensor to trg in CPU
118114
ReduceLoDTensor func(lod_tensors, &trg);
119115
VisitDataType(ToDataType(lod_tensors[0]->type()), func);
120-
bool use_cuda = use_cuda_;
116+
121117
for (size_t i = 1; i < local_scopes_.size(); ++i) {
122118
auto &scope =
123119
*local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
124120
auto &p = places_[i];
125121
auto *var = scope.FindVar(out_var_handles[i]->name_);
126122
auto *dev_ctx = dev_ctxes_[p];
127123

128-
RunAndRecordEvent(p, [&trg, var, dev_ctx, p, use_cuda] {
129-
#ifdef PADDLE_WITH_CUDA
130-
if (use_cuda) {
131-
auto &tensor_dst = *var->GetMutable<framework::LoDTensor>();
132-
auto &tensor_src = trg;
133-
TensorCopy(tensor_src, p, *dev_ctx, &tensor_dst);
134-
} else {
135-
auto &tensor_dst = *var->GetMutable<framework::LoDTensor>();
136-
tensor_dst.ShareDataWith(trg);
137-
}
138-
#else
139-
auto &tensor_dst = *var->GetMutable<framework::LoDTensor>();
140-
tensor_dst.ShareDataWith(trg);
141-
#endif
124+
RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
125+
auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
126+
auto &tensor_cpu = trg;
127+
TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu);
142128
});
143129
}
144130
}

paddle/fluid/framework/details/all_reduce_op_handle.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ struct AllReduceOpHandle : public OpHandleBase {
5252
#ifdef PADDLE_WITH_CUDA
5353
const platform::NCCLContextMap *nccl_ctxs_;
5454
#endif
55-
bool use_cuda_;
5655
};
5756

5857
} // namespace details

paddle/fluid/framework/details/multi_devices_graph_builder.cc

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -260,13 +260,21 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
260260
}
261261
}
262262

263-
// Insert BCast Ops
264-
for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
265-
auto &to_bcast_set = bcast_var_name_set[dev_id];
266-
for (auto &bcast_name : to_bcast_set) {
267-
CreateBroadcastOp(&result, bcast_name, dev_id);
263+
bool use_gpu = false;
264+
#ifdef PADDLE_WITH_CUDA
265+
use_gpu = nccl_ctxs_ != nullptr;
266+
#endif
267+
268+
if (use_gpu) {
269+
// Insert BCast Ops
270+
for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
271+
auto &to_bcast_set = bcast_var_name_set[dev_id];
272+
for (auto &bcast_name : to_bcast_set) {
273+
CreateBroadcastOp(&result, bcast_name, dev_id);
274+
}
268275
}
269276
}
277+
270278
/*
271279
Dependency graph has been constructed. However, there are still data
272280
hazards need to be handled.

paddle/fluid/framework/parallel_executor.cc

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ ParallelExecutor::ParallelExecutor(
9595
}
9696

9797
if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
98-
BCastParamsToGPUs(bcast_vars, member_->use_cuda_);
98+
BCastParamsToGPUs(bcast_vars);
9999
}
100100
// Startup Program has been run. All local scopes has correct parameters.
101101

@@ -132,9 +132,7 @@ ParallelExecutor::ParallelExecutor(
132132
}
133133

134134
void ParallelExecutor::BCastParamsToGPUs(
135-
const std::unordered_set<std::string> &vars, const bool use_cuda) const {
136-
auto *main_scope = member_->local_scopes_[0];
137-
135+
const std::unordered_set<std::string> &vars) const {
138136
// the the initialize bcast, all vars would be bcast from device(0), otherwise
139137
// bcast from the specified device.
140138
bool initialize = builder_.get() == nullptr ? true : false;
@@ -156,12 +154,11 @@ void ParallelExecutor::BCastParamsToGPUs(
156154
}
157155

158156
auto &main_tensor = main_var->Get<LoDTensor>();
159-
#ifdef PADDLE_WITH_CUDA
160-
auto &dims = main_tensor.dims();
161-
#endif
157+
162158
if (paddle::platform::is_gpu_place(main_tensor.place())) {
163159
#ifdef PADDLE_WITH_CUDA
164160
std::vector<void *> buffers;
161+
auto &dims = main_tensor.dims();
165162
size_t numel = main_tensor.numel();
166163
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
167164
for (size_t i = 0; i < member_->places_.size(); ++i) {
@@ -200,7 +197,8 @@ void ParallelExecutor::BCastParamsToGPUs(
200197
auto local_scope = member_->local_scopes_[i];
201198
auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
202199
#ifdef PADDLE_WITH_CUDA
203-
if (use_cuda) {
200+
if (member_->use_cuda_) {
201+
auto &dims = main_tensor.dims();
204202
t->Resize(dims);
205203
t->mutable_data(cpu, main_tensor.type());
206204
paddle::framework::TensorCopy(main_tensor, cpu, t);

paddle/fluid/framework/parallel_executor.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,7 @@ class ParallelExecutor {
6666
void Run(const std::vector<std::string> &fetch_tensors,
6767
const std::string &fetched_var_name);
6868

69-
void BCastParamsToGPUs(const std::unordered_set<std::string> &vars,
70-
const bool use_cuda) const;
69+
void BCastParamsToGPUs(const std::unordered_set<std::string> &vars) const;
7170

7271
private:
7372
ParallelExecutorPrivate *member_;

python/paddle/fluid/tests/unittests/parallel_executor_test_base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ def run_executor(exe, feed, fetch_list, program=None):
4545
raise ValueError('Unkown type exe')
4646
return res
4747

48+
if not use_cuda:
49+
balance_parameter_opt_between_cards = True
4850
main = fluid.Program()
4951
startup = fluid.Program()
5052
startup.random_seed = 1 # Fix random seed

0 commit comments

Comments (0)