
Commit 0039afd

Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into slim_ligth_nas
2 parents 6151521 + 6d8075e commit 0039afd


248 files changed, 8199 insertions(+), 2719 deletions(-)


Dockerfile

Lines changed: 0 additions & 1 deletion
@@ -1,7 +1,6 @@
 # A image for building paddle binaries
 # Use cuda devel base image for both cpu and gpu environment
 # When you modify it, please be aware of cudnn-runtime version
-# and libcudnn.so.x in paddle/scripts/docker/build.sh
 FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>

cmake/external/ngraph.cmake

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs)
 INCLUDE(ExternalProject)
 
 SET(NGRAPH_PROJECT "extern_ngraph")
-SET(NGRAPH_GIT_TAG "127e0dedfaac8c6f2b148cc03bf5f67ac5fbe6fe")
+SET(NGRAPH_GIT_TAG "096ad6ef0c04d57db1522940dbdf9a0652768065")
 SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
 SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
 SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)

cmake/generic.cmake

Lines changed: 1 addition & 1 deletion
@@ -385,7 +385,7 @@ function(cc_test TARGET_NAME)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true ${MKL_DEBUG_FLAG})
     # No unit test should exceed 10 minutes.
     set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
   endif()

paddle/fluid/API.spec

Lines changed: 73 additions & 70 deletions
Large diffs are not rendered by default.

paddle/fluid/framework/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -225,6 +225,8 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
 
 cc_test(tuple_test SRCS tuple_test.cc )
 
+cc_test(inlined_vector_test SRCS inlined_vector_test.cc)
+
 if (NOT WIN32)
   cc_test(rw_lock_test SRCS rw_lock_test.cc)
 endif (NOT WIN32)

paddle/fluid/framework/details/all_reduce_op_handle.cc

Lines changed: 8 additions & 23 deletions
@@ -35,16 +35,9 @@ namespace details {
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                      const std::vector<Scope *> &local_scopes,
                                      const std::vector<platform::Place> &places,
-                                     const platform::NCCLContextMap *ctxs)
-    : OpHandleBase(node),
-      local_scopes_(local_scopes),
-      places_(places),
-      nccl_ctxs_(ctxs) {
-  if (nccl_ctxs_) {
-    for (auto &p : places_) {
-      this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
-    }
-  }
+                                     const platform::MultiNCCLContextMap *ctxs)
+    : NCCLOpHandleBase(node, places, ctxs), local_scopes_(local_scopes) {
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
 }
 #else
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
@@ -71,7 +64,9 @@ void AllReduceOpHandle::RunAllReduceFuncs(
   if (FLAGS_sync_nccl_allreduce) {
     for (auto &p : places_) {
       int dev_id = boost::get<platform::CUDAPlace>(p).device;
-      auto &nccl_ctx = nccl_ctxs_->at(dev_id);
+      auto *nccl_ctxs =
+          nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, use_hierarchical_allreduce_);
+      auto &nccl_ctx = nccl_ctxs->at(dev_id);
       auto stream = nccl_ctx.stream();
       cudaError_t e_sync = cudaStreamSynchronize(stream);
       if (e_sync != 0) {
@@ -134,19 +129,9 @@ void AllReduceOpHandle::RunImpl() {
       numel = static_cast<size_t>(lod_tensor.numel());
     }
 
-    int dev_id = boost::get<platform::CUDAPlace>(p).device;
-    auto &nccl_ctx = nccl_ctxs_->at(dev_id);
-    auto stream = nccl_ctx.stream();
-    auto comm = nccl_ctx.comm_;
-
-    VLOG(10) << "before all reduce buffer:" << buffer << ", numel:" << numel
-             << ", dev_id:" << dev_id << ", dtype:" << dtype
-             << ", place:" << p;
-
     all_reduce_calls.emplace_back([=] {
-      PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
-          buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
-          comm, stream));
+      NCCLAllReduce(p, buffer, buffer, numel,
+                    static_cast<ncclDataType_t>(dtype), ncclSum);
     });
   }
   RunAllReduceFuncs(all_reduce_calls);
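
The change above collapses the per-place communicator/stream lookup and the raw ncclAllReduce call into helpers inherited from NCCLOpHandleBase, so the op handle only enqueues reduction calls. The sketch below is a minimal, NCCL-free illustration of that shape, not Paddle code: CommOpHandleBase, DeviceComm, AllReduceOn, and the plain summation are stand-ins for the real communicator map, contexts, and collective call.

// Schematic only: per-device communicator selection lives in the base class,
// the derived handle just records what to reduce. Placeholders, not Paddle API.
#include <functional>
#include <iostream>
#include <map>
#include <numeric>
#include <vector>

struct DeviceComm {            // stand-in for an NCCL communicator + stream
  int device_id;
};

class CommOpHandleBase {       // plays the role of NCCLOpHandleBase
 public:
  explicit CommOpHandleBase(std::vector<int> devices) {
    for (int d : devices) comms_[d] = DeviceComm{d};
  }

 protected:
  // Plays the role of the NCCLAllReduce(...) helper in the diff: pick the
  // communicator for `device` and run the reduction there (here a CPU sum).
  void AllReduceOn(int device, const std::vector<float>& in, float* out) {
    const DeviceComm& comm = comms_.at(device);
    *out = std::accumulate(in.begin(), in.end(), 0.0f);
    std::cout << "reduced on device " << comm.device_id << ": " << *out << "\n";
  }

 private:
  std::map<int, DeviceComm> comms_;
};

class AllReduceHandle : public CommOpHandleBase {  // plays AllReduceOpHandle
 public:
  using CommOpHandleBase::CommOpHandleBase;

  void Run(const std::map<int, std::vector<float>>& per_device_grads) {
    std::vector<std::function<void()>> calls;
    for (const auto& kv : per_device_grads) {
      int device = kv.first;
      const std::vector<float>& grad = kv.second;
      // The handle no longer touches communicators directly; it defers to the
      // base-class helper, mirroring the refactored lambda body above.
      calls.emplace_back([this, device, grad] {
        float result = 0.f;
        AllReduceOn(device, grad, &result);
      });
    }
    for (auto& call : calls) call();
  }
};

int main() {
  AllReduceHandle handle({0, 1});
  handle.Run({{0, {1.f, 2.f}}, {1, {3.f, 4.f}}});
  return 0;
}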

paddle/fluid/framework/details/all_reduce_op_handle.h

Lines changed: 12 additions & 5 deletions
@@ -21,20 +21,23 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
 namespace paddle {
 namespace framework {
 namespace details {
 
-class AllReduceOpHandle : public OpHandleBase {
- public:
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+class AllReduceOpHandle : public NCCLOpHandleBase {
+ public:
   AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places,
-                    const platform::NCCLContextMap *ctxs);
+                    const platform::MultiNCCLContextMap *ctxs);
 #else
+class AllReduceOpHandle : public OpHandleBase {
+ public:
   AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places);
 #endif
@@ -46,13 +49,17 @@ class AllReduceOpHandle : public OpHandleBase {
 
  protected:
   void RunImpl() override;
-
   std::vector<Scope *> local_scopes_;
+
+#if !(defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
+  // NCCLOpHandleBase already have these attributes.
+  // Will polish it by class inheritance framework.
   std::vector<platform::Place> places_;
+#endif
+
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   void RunAllReduceFuncs(
       const std::vector<std::function<void()>> &all_reduce_calls);
-  const platform::NCCLContextMap *nccl_ctxs_;
 #endif
 };
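
In the header, the class now picks its base at compile time: with CUDA it derives from NCCLOpHandleBase, which already carries the places and communicators, otherwise it stays on OpHandleBase and keeps places_ itself. A compact illustration of that conditional-base pattern follows; HAS_GPU_RUNTIME and both base classes are hypothetical names used only for this sketch.

// Sketch of the pattern in the header above: the same handle class chooses its
// base (and the members it still needs) at compile time. Illustrative only.
#include <iostream>
#include <string>
#include <vector>

// #define HAS_GPU_RUNTIME 1   // would normally come from the build system

class PlainOpHandleBase {
 public:
  virtual ~PlainOpHandleBase() = default;
  virtual std::string Backend() const { return "plain"; }
};

#if defined(HAS_GPU_RUNTIME)
class CommOpHandleBase : public PlainOpHandleBase {
 public:
  std::string Backend() const override { return "collective"; }
  // places/communicators would live here, as the diff comment
  // ("NCCLOpHandleBase already have these attributes") suggests.
};

class ReduceHandle : public CommOpHandleBase {};
#else
class ReduceHandle : public PlainOpHandleBase {
 public:
  std::vector<int> places_;  // only kept when no communicator base exists
};
#endif

int main() {
  ReduceHandle handle;
  std::cout << "backend: " << handle.Backend() << "\n";
  return 0;
}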

paddle/fluid/framework/details/async_ssa_graph_executor.cc

Lines changed: 30 additions & 36 deletions
@@ -51,45 +51,39 @@ void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
   VLOG(3) << "ProcessGraph";
   RpcCtxMap send_varname_to_ctx;
   RpcCtxMap recv_varname_to_ctx;
-  for (auto i = 0; i < graphs.size(); ++i) {
-    std::vector<ir::Node *> nodes_to_delete;
-    for (auto &node : graphs[i]->Nodes()) {
-      VLOG(3) << "node name " << node->Name();
-      if (node && node->IsOp()) {
-        if (node->Name() == "send") {
-          auto send_var_name = node->Op()->Input("X")[0];
-          auto send_varnames = boost::get<std::vector<std::string>>(
-              node->Op()->GetNullableAttr("send_varnames"));
-          auto epmap = boost::get<std::vector<std::string>>(
-              node->Op()->GetNullableAttr("epmap"));
-          auto height_section = boost::get<std::vector<int64_t>>(
-              node->Op()->GetNullableAttr("sections"));
-          auto trainer_id =
-              boost::get<int>(node->Op()->GetNullableAttr("trainer_id"));
-          send_varname_to_ctx[send_var_name] =
-              operators::distributed::RpcContext(send_var_name, send_varnames,
-                                                 epmap, height_section,
-                                                 trainer_id);
-          VLOG(3) << "find and init an send op: "
-                  << send_varname_to_ctx[send_var_name];
-        } else if (node->Name() == "recv") {
-          auto recv_var_name = node->Op()->Output("Out")[0];
-          auto recv_varnames = boost::get<std::vector<std::string>>(
-              node->Op()->GetNullableAttr("recv_varnames"));
-          auto epmap = boost::get<std::vector<std::string>>(
-              node->Op()->GetNullableAttr("epmap"));
-          auto trainer_id =
-              boost::get<int>(node->Op()->GetNullableAttr("trainer_id"));
-          recv_varname_to_ctx[recv_var_name] =
-              operators::distributed::RpcContext(recv_var_name, recv_varnames,
-                                                 epmap, {}, trainer_id);
-          nodes_to_delete.push_back(node);
-          VLOG(3) << "find and remove an recv op: "
-                  << recv_varname_to_ctx[recv_var_name];
-        }
+  for (auto &node : graphs[0]->Nodes()) {
+    VLOG(3) << "node name " << node->Name();
+    if (node && node->IsOp()) {
+      if (node->Name() == "send") {
+        auto send_var_name = node->Op()->Input("X")[0];
+        auto send_varnames = boost::get<std::vector<std::string>>(
+            node->Op()->GetNullableAttr("send_varnames"));
+        auto epmap = boost::get<std::vector<std::string>>(
+            node->Op()->GetNullableAttr("epmap"));
+        auto height_section = boost::get<std::vector<int64_t>>(
+            node->Op()->GetNullableAttr("sections"));
+        auto trainer_id =
+            boost::get<int>(node->Op()->GetNullableAttr("trainer_id"));
+        send_varname_to_ctx[send_var_name] = operators::distributed::RpcContext(
+            send_var_name, send_varnames, epmap, height_section, trainer_id);
+        VLOG(3) << "find and init an send op: "
+                << send_varname_to_ctx[send_var_name];
+      } else if (node->Name() == "recv") {
+        auto recv_var_name = node->Op()->Output("Out")[0];
+        auto recv_varnames = boost::get<std::vector<std::string>>(
+            node->Op()->GetNullableAttr("recv_varnames"));
+        auto epmap = boost::get<std::vector<std::string>>(
+            node->Op()->GetNullableAttr("epmap"));
+        auto trainer_id =
+            boost::get<int>(node->Op()->GetNullableAttr("trainer_id"));
+        recv_varname_to_ctx[recv_var_name] = operators::distributed::RpcContext(
+            recv_var_name, recv_varnames, epmap, {}, trainer_id);
+        VLOG(3) << "find and remove an recv op: "
+                << recv_varname_to_ctx[recv_var_name];
       }
     }
   }
+
   // init communicator here
   if (send_varname_to_ctx.size() > 0) {
     VLOG(3) << "this is distribute mode, will use communicator";

paddle/fluid/framework/details/build_strategy.cc

Lines changed: 21 additions & 12 deletions
@@ -256,16 +256,14 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
   return framework::ir::MultiDevSSAGraphBuilder().count(pass_name) > 0;
 }
 
-ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
-                                const std::vector<platform::Place> &places,
-                                const std::string &loss_var_name,
-                                const std::vector<Scope *> &local_scopes,
-                                const size_t &nranks,
+ir::Graph *BuildStrategy::Apply(
+    ir::Graph *graph, const std::vector<platform::Place> &places,
+    const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
+    const size_t &nranks,
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-                                const bool use_cuda,
-                                platform::NCCLContextMap *nccl_ctxs) const {
+    const bool use_cuda, platform::MultiNCCLContextMap *nccl_ctxs) const {
 #else
-                                const bool use_cuda) const {
+    const bool use_cuda) const {
 #endif
   VLOG(3) << "apply all passes";
   // Create a default one if not finalized by user.
@@ -285,9 +283,9 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
       pass->Set<size_t>(ir::kNRanks, new size_t(nranks));
 
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
+      platform::MultiNCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
       pass->Erase(kNCCLCtxs);
-      pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
+      pass->SetNotOwned<platform::MultiNCCLContextMap>(kNCCLCtxs, nctx);
 #endif
     } else if (pass->Type() == "alloc_continuous_space_for_grad_pass" ||
                pass->Type() == "fuse_adam_op_pass" ||
@@ -301,9 +299,12 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
                                                     &local_scopes);
       if (pass->Type() == "fuse_all_reduce_op_pass") {
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-        platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
+        platform::MultiNCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
         pass->Erase(kNCCLCtxs);
-        pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
+        pass->SetNotOwned<platform::MultiNCCLContextMap>(kNCCLCtxs, nctx);
+        pass->Erase(kUseHierarchicalAllReduce);
+        pass->Set<bool>(kUseHierarchicalAllReduce,
+                        new bool(use_hierarchical_allreduce_));
 #endif
       }
     } else if (pass->Type() == "alloc_continuous_space_for_grad_pass") {
@@ -316,6 +317,14 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
       LOG(INFO) << "set enable_sequential_execution:"
                 << enable_sequential_execution_;
     } else if (pass->Type() == "all_reduce_deps_pass") {
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+      platform::MultiNCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
+      pass->Erase(kNCCLCtxs);
+      pass->SetNotOwned<platform::MultiNCCLContextMap>(kNCCLCtxs, nctx);
+      pass->Erase(kUseHierarchicalAllReduce);
+      pass->Set<bool>(kUseHierarchicalAllReduce,
+                      new bool(use_hierarchical_allreduce_));
+#endif
       LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
                 << ", num_trainers:" << num_trainers_;
     } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
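
BuildStrategy::Apply now hands every NCCL-aware pass two things: a non-owned pointer to the MultiNCCLContextMap and an owned copy of the use_hierarchical_allreduce_ flag, each stored under a string key after erasing any previous value. The toy program below imitates that owned-versus-not-owned attribute hand-off with a stand-in AttrStore; it is not paddle::framework::ir::Pass, and the key names are illustrative.

// Toy illustration of the Erase/Set hand-off seen above: a pass-like object
// receives a borrowed context pointer and an owned bool flag under string keys.
#include <cassert>
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct MultiCommMap { int num_comms = 2; };  // stand-in for MultiNCCLContextMap

class AttrStore {
 public:
  void Erase(const std::string& key) {
    owned_.erase(key);
    borrowed_.erase(key);
  }
  // Owned attribute: the store deletes it (mirrors pass->Set<bool>(..., new bool(...))).
  void SetOwnedBool(const std::string& key, bool value) {
    owned_[key] = std::make_shared<bool>(value);
  }
  // Not-owned attribute: the caller keeps ownership (mirrors pass->SetNotOwned<...>).
  void SetNotOwned(const std::string& key, MultiCommMap* ctx) { borrowed_[key] = ctx; }

  bool GetBool(const std::string& key) const { return *owned_.at(key); }
  MultiCommMap* GetCtx(const std::string& key) const { return borrowed_.at(key); }

 private:
  std::map<std::string, std::shared_ptr<bool>> owned_;
  std::map<std::string, MultiCommMap*> borrowed_;
};

int main() {
  MultiCommMap comms;                      // owned by the "builder", not the pass
  bool use_hierarchical_allreduce = true;  // the BuildStrategy field

  AttrStore pass;
  pass.Erase("nccl_ctxs");
  pass.SetNotOwned("nccl_ctxs", &comms);
  pass.Erase("use_hierarchical_allreduce");
  pass.SetOwnedBool("use_hierarchical_allreduce", use_hierarchical_allreduce);

  assert(pass.GetCtx("nccl_ctxs")->num_comms == 2);
  std::cout << "hierarchical allreduce: " << std::boolalpha
            << pass.GetBool("use_hierarchical_allreduce") << "\n";
  return 0;
}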

paddle/fluid/framework/details/build_strategy.h

Lines changed: 12 additions & 1 deletion
@@ -111,6 +111,17 @@ struct BuildStrategy {
   bool cache_runtime_context_{false};
   std::unordered_set<std::string> mkldnn_enabled_op_types_;
 
+  size_t nccl_comm_num_{1};
+  // The picture is here:
+  // https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396
+  bool use_hierarchical_allreduce_{false};
+  // Nccl ranks in a node when use hierarchical allreduce, it's setted to gpu
+  // cards' number in most cases.
+  size_t hierarchical_allreduce_inter_nranks_{0};
+  // Nccl ranks bewteen nodes when use hierarchical allreduce, it's setted to
+  // nodes number.
+  size_t hierarchical_allreduce_exter_nranks_{0};
+
   // NOTE:
   // Before you add new options, think if it's a general strategy that works
   // with other strategy. If not, the strategy should be created through
@@ -136,7 +147,7 @@ struct BuildStrategy {
                   const size_t &nranks,
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
                   const bool use_cuda,
-                  platform::NCCLContextMap *nccl_ctxs) const;
+                  platform::MultiNCCLContextMap *nccl_ctxs) const;
 #else
                   const bool use_cuda) const;
 #endif
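
The comments on the new fields say hierarchical_allreduce_inter_nranks_ is usually the number of GPU cards per node and hierarchical_allreduce_exter_nranks_ the number of nodes. The sketch below shows one plausible way those two counts could slice a global rank into an intra-node group and a cross-node group; the grouping rule is an assumption for illustration and is not taken from this commit.

// Minimal sketch: how per-node rank count (inter) and node count (exter) could
// map a global rank to its hierarchical allreduce groups. Illustrative only.
#include <iostream>

struct HierarchicalGroups {
  int inter_group;  // which node this rank sits on (reduces inside the node)
  int inter_rank;   // rank within that node
  int exter_rank;   // rank in the cross-node group (one participant per node)
  bool in_exter;    // only local rank 0 of each node joins the cross-node step
};

HierarchicalGroups SliceRank(int global_rank, int inter_nranks, int exter_nranks) {
  HierarchicalGroups g;
  g.inter_group = global_rank / inter_nranks;  // node index
  g.inter_rank = global_rank % inter_nranks;   // GPU index inside the node
  g.exter_rank = g.inter_group;                // node index doubles as exter rank
  g.in_exter = (g.inter_rank == 0) && (g.inter_group < exter_nranks);
  return g;
}

int main() {
  const int inter_nranks = 8;  // e.g. 8 GPUs per node
  const int exter_nranks = 4;  // e.g. 4 nodes
  for (int rank = 0; rank < inter_nranks * exter_nranks; rank += 9) {
    HierarchicalGroups g = SliceRank(rank, inter_nranks, exter_nranks);
    std::cout << "rank " << rank << " -> node " << g.inter_group
              << ", local " << g.inter_rank
              << (g.in_exter ? " (joins cross-node allreduce)" : "") << "\n";
  }
  return 0;
}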
