
Commit 0039afd

Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into slim_ligth_nas
2 parents 6151521 + 6d8075e commit 0039afd


248 files changed, 8199 insertions(+), 2719 deletions(-)


Dockerfile

Lines changed: 0 additions & 1 deletion
@@ -1,7 +1,6 @@
 # A image for building paddle binaries
 # Use cuda devel base image for both cpu and gpu environment
 # When you modify it, please be aware of cudnn-runtime version
-# and libcudnn.so.x in paddle/scripts/docker/build.sh
 FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>

cmake/external/ngraph.cmake

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs)
 INCLUDE(ExternalProject)
 
 SET(NGRAPH_PROJECT "extern_ngraph")
-SET(NGRAPH_GIT_TAG "127e0dedfaac8c6f2b148cc03bf5f67ac5fbe6fe")
+SET(NGRAPH_GIT_TAG "096ad6ef0c04d57db1522940dbdf9a0652768065")
 SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
 SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
 SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)

cmake/generic.cmake

Lines changed: 1 addition & 1 deletion
@@ -385,7 +385,7 @@ function(cc_test TARGET_NAME)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G
-    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true ${MKL_DEBUG_FLAG})
     # No unit test should exceed 10 minutes.
     set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
   endif()

paddle/fluid/API.spec

Lines changed: 73 additions & 70 deletions
Large diffs are not rendered by default.

paddle/fluid/framework/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -225,6 +225,8 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
 
 cc_test(tuple_test SRCS tuple_test.cc )
 
+cc_test(inlined_vector_test SRCS inlined_vector_test.cc)
+
 if (NOT WIN32)
   cc_test(rw_lock_test SRCS rw_lock_test.cc)
 endif (NOT WIN32)

paddle/fluid/framework/details/all_reduce_op_handle.cc

Lines changed: 8 additions & 23 deletions
@@ -35,16 +35,9 @@ namespace details {
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                      const std::vector<Scope *> &local_scopes,
                                      const std::vector<platform::Place> &places,
-                                     const platform::NCCLContextMap *ctxs)
-    : OpHandleBase(node),
-      local_scopes_(local_scopes),
-      places_(places),
-      nccl_ctxs_(ctxs) {
-  if (nccl_ctxs_) {
-    for (auto &p : places_) {
-      this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
-    }
-  }
+                                     const platform::MultiNCCLContextMap *ctxs)
+    : NCCLOpHandleBase(node, places, ctxs), local_scopes_(local_scopes) {
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
 }
 #else
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
@@ -71,7 +64,9 @@ void AllReduceOpHandle::RunAllReduceFuncs(
   if (FLAGS_sync_nccl_allreduce) {
     for (auto &p : places_) {
       int dev_id = boost::get<platform::CUDAPlace>(p).device;
-      auto &nccl_ctx = nccl_ctxs_->at(dev_id);
+      auto *nccl_ctxs =
+          nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, use_hierarchical_allreduce_);
+      auto &nccl_ctx = nccl_ctxs->at(dev_id);
       auto stream = nccl_ctx.stream();
       cudaError_t e_sync = cudaStreamSynchronize(stream);
       if (e_sync != 0) {
@@ -134,19 +129,9 @@ void AllReduceOpHandle::RunImpl() {
       numel = static_cast<size_t>(lod_tensor.numel());
     }
 
-    int dev_id = boost::get<platform::CUDAPlace>(p).device;
-    auto &nccl_ctx = nccl_ctxs_->at(dev_id);
-    auto stream = nccl_ctx.stream();
-    auto comm = nccl_ctx.comm_;
-
-    VLOG(10) << "before all reduce buffer:" << buffer << ", numel:" << numel
-             << ", dev_id:" << dev_id << ", dtype:" << dtype
-             << ", place:" << p;
-
     all_reduce_calls.emplace_back([=] {
-      PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
-          buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
-          comm, stream));
+      NCCLAllReduce(p, buffer, buffer, numel,
+                    static_cast<ncclDataType_t>(dtype), ncclSum);
     });
   }
   RunAllReduceFuncs(all_reduce_calls);
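
The change above collapses the per-place communicator/stream lookup and the raw ncclAllReduce call into helpers inherited from NCCLOpHandleBase, so the op handle only enqueues reduction calls. The sketch below is a minimal, NCCL-free illustration of that shape, not Paddle code: CommOpHandleBase, DeviceComm, AllReduceOn, and the plain summation are stand-ins for the real communicator map, contexts, and collective call.

// Schematic only: per-device communicator selection lives in the base class,
// the derived handle just records what to reduce. Placeholders, not Paddle API.
#include <functional>
#include <iostream>
#include <map>
#include <numeric>
#include <vector>

struct DeviceComm {            // stand-in for an NCCL communicator + stream
  int device_id;
};

class CommOpHandleBase {       // plays the role of NCCLOpHandleBase
 public:
  explicit CommOpHandleBase(std::vector<int> devices) {
    for (int d : devices) comms_[d] = DeviceComm{d};
  }

 protected:
  // Plays the role of the NCCLAllReduce(...) helper in the diff: pick the
  // communicator for `device` and run the reduction there (here a CPU sum).
  void AllReduceOn(int device, const std::vector<float>& in, float* out) {
    const DeviceComm& comm = comms_.at(device);
    *out = std::accumulate(in.begin(), in.end(), 0.0f);
    std::cout << "reduced on device " << comm.device_id << ": " << *out << "\n";
  }

 private:
  std::map<int, DeviceComm> comms_;
};

class AllReduceHandle : public CommOpHandleBase {  // plays AllReduceOpHandle
 public:
  using CommOpHandleBase::CommOpHandleBase;

  void Run(const std::map<int, std::vector<float>>& per_device_grads) {
    std::vector<std::function<void()>> calls;
    for (const auto& kv : per_device_grads) {
      int device = kv.first;
      const std::vector<float>& grad = kv.second;
      // The handle no longer touches communicators directly; it defers to the
      // base-class helper, mirroring the refactored lambda body above.
      calls.emplace_back([this, device, grad] {
        float result = 0.f;
        AllReduceOn(device, grad, &result);
      });
    }
    for (auto& call : calls) call();
  }
};

int main() {
  AllReduceHandle handle({0, 1});
  handle.Run({{0, {1.f, 2.f}}, {1, {3.f, 4.f}}});
  return 0;
}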

paddle/fluid/framework/details/all_reduce_op_handle.h

Lines changed: 12 additions & 5 deletions
@@ -21,20 +21,23 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
 namespace paddle {
 namespace framework {
 namespace details {
 
-class AllReduceOpHandle : public OpHandleBase {
- public:
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+class AllReduceOpHandle : public NCCLOpHandleBase {
+ public:
   AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places,
-                    const platform::NCCLContextMap *ctxs);
+                    const platform::MultiNCCLContextMap *ctxs);
 #else
+class AllReduceOpHandle : public OpHandleBase {
+ public:
   AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places);
 #endif
@@ -46,13 +49,17 @@ class AllReduceOpHandle : public OpHandleBase {
 
  protected:
   void RunImpl() override;
-
   std::vector<Scope *> local_scopes_;
+
+#if !(defined(PADDLE_WITH_CUDA) && !defined(_WIN32))
+  // NCCLOpHandleBase already have these attributes.
+  // Will polish it by class inheritance framework.
   std::vector<platform::Place> places_;
+#endif
+
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   void RunAllReduceFuncs(
       const std::vector<std::function<void()>> &all_reduce_calls);
-  const platform::NCCLContextMap *nccl_ctxs_;
 #endif
 };
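
In the header, the class now picks its base at compile time: with CUDA it derives from NCCLOpHandleBase, which already carries the places and communicators, otherwise it stays on OpHandleBase and keeps places_ itself. A compact illustration of that conditional-base pattern follows; HAS_GPU_RUNTIME and both base classes are hypothetical names used only for this sketch.

// Sketch of the pattern in the header above: the same handle class chooses its
// base (and the members it still needs) at compile time. Illustrative only.
#include <iostream>
#include <string>
#include <vector>

// #define HAS_GPU_RUNTIME 1   // would normally come from the build system

class PlainOpHandleBase {
 public:
  virtual ~PlainOpHandleBase() = default;
  virtual std::string Backend() const { return "plain"; }
};

#if defined(HAS_GPU_RUNTIME)
class CommOpHandleBase : public PlainOpHandleBase {
 public:
  std::string Backend() const override { return "collective"; }
  // places/communicators would live here, as the diff comment
  // ("NCCLOpHandleBase already have these attributes") suggests.
};

class ReduceHandle : public CommOpHandleBase {};
#else
class ReduceHandle : public PlainOpHandleBase {
 public:
  std::vector<int> places_;  // only kept when no communicator base exists
};
#endif

int main() {
  ReduceHandle handle;
  std::cout << "backend: " << handle.Backend() << "\n";
  return 0;
}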

paddle/fluid/framework/details/async_ssa_graph_executor.cc

Lines changed: 30 additions & 36 deletions
@@ -51,45 +51,39 @@ void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
   VLOG(3) << "ProcessGraph";
   RpcCtxMap send_varname_to_ctx;
   RpcCtxMap recv_varname_to_ctx;
-  for (auto i = 0; i < graphs.size(); ++i) {
-    std::vector<ir::Node *> nodes_to_delete;
-    for (auto &node : graphs[i]->Nodes()) {
-      VLOG(3) << "node name " << node->Name();
-      if (node && node->IsOp()) {
-        if (node->Name() == "send") {
-          auto send_var_name = node->Op()->Input("X")[0];
-          auto send_varnames = boost::get<std::vector<std::string>>(
-              node->Op()->GetNullableAttr("send_varnames"));
-          auto epmap = boost::get<std::vector<std::string>>(
-              node->Op()->GetNullableAttr("epmap"));
-          auto height_section = boost::get<std::vector<int64_t>>(
-              node->Op()->GetNullableAttr("sections"));
-          auto trainer_id =
-              boost::get<int>(node->Op()->GetNullableAttr("trainer_id"));
-          send_varname_to_ctx[send_var_name] =
-              operators::distributed::RpcContext(send_var_name, send_varnames,
-                                                 epmap, height_section,
-                                                 trainer_id);
-          VLOG(3) << "find and init an send op: "
-                  << send_varname_to_ctx[send_var_name];
-        } else if (node->Name() == "recv") {
-          auto recv_var_name = node->Op()->Output("Out")[0];
-          auto recv_varnames = boost::get<std::vector<std::string>>(
-              node->Op()->GetNullableAttr("recv_varnames"));
-          auto epmap = boost::get<std::vector<std::string>>(
-              node->Op()->GetNullableAttr("epmap"));
-          auto trainer_id =
-              boost::get<int>(node->Op()->GetNullableAttr("trainer_id"));
-          recv_varname_to_ctx[recv_var_name] =
-              operators::distributed::RpcContext(recv_var_name, recv_varnames,
-                                                 epmap, {}, trainer_id);
-          nodes_to_delete.push_back(node);
-          VLOG(3) << "find and remove an recv op: "
-                  << recv_varname_to_ctx[recv_var_name];
-        }
+  for (auto &node : graphs[0]->Nodes()) {
+    VLOG(3) << "node name " << node->Name();
+    if (node && node->IsOp()) {
+      if (node->Name() == "send") {
+        auto send_var_name = node->Op()->Input("X")[0];
+        auto send_varnames = boost::get<std::vector<std::string>>(
+            node->Op()->GetNullableAttr("send_varnames"));
+        auto epmap = boost::get<std::vector<std::string>>(
+            node->Op()->GetNullableAttr("epmap"));
+        auto height_section = boost::get<std::vector<int64_t>>(
+            node->Op()->GetNullableAttr("sections"));
+        auto trainer_id =
+            boost::get<int>(node->Op()->GetNullableAttr("trainer_id"));
+        send_varname_to_ctx[send_var_name] = operators::distributed::RpcContext(
+            send_var_name, send_varnames, epmap, height_section, trainer_id);
+        VLOG(3) << "find and init an send op: "
+                << send_varname_to_ctx[send_var_name];
+      } else if (node->Name() == "recv") {
+        auto recv_var_name = node->Op()->Output("Out")[0];
+        auto recv_varnames = boost::get<std::vector<std::string>>(
+            node->Op()->GetNullableAttr("recv_varnames"));
+        auto epmap = boost::get<std::vector<std::string>>(
+            node->Op()->GetNullableAttr("epmap"));
+        auto trainer_id =
+            boost::get<int>(node->Op()->GetNullableAttr("trainer_id"));
+        recv_varname_to_ctx[recv_var_name] = operators::distributed::RpcContext(
+            recv_var_name, recv_varnames, epmap, {}, trainer_id);
+        VLOG(3) << "find and remove an recv op: "
+                << recv_varname_to_ctx[recv_var_name];
       }
     }
   }
+
   // init communicator here
   if (send_varname_to_ctx.size() > 0) {
     VLOG(3) << "this is distribute mode, will use communicator";

paddle/fluid/framework/details/build_strategy.cc

Lines changed: 21 additions & 12 deletions
@@ -256,16 +256,14 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
   return framework::ir::MultiDevSSAGraphBuilder().count(pass_name) > 0;
 }
 
-ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
-                                const std::vector<platform::Place> &places,
-                                const std::string &loss_var_name,
-                                const std::vector<Scope *> &local_scopes,
-                                const size_t &nranks,
+ir::Graph *BuildStrategy::Apply(
+    ir::Graph *graph, const std::vector<platform::Place> &places,
+    const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
+    const size_t &nranks,
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-                                const bool use_cuda,
-                                platform::NCCLContextMap *nccl_ctxs) const {
+    const bool use_cuda, platform::MultiNCCLContextMap *nccl_ctxs) const {
 #else
-                                const bool use_cuda) const {
+    const bool use_cuda) const {
 #endif
   VLOG(3) << "apply all passes";
   // Create a default one if not finalized by user.
@@ -285,9 +283,9 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
       pass->Set<size_t>(ir::kNRanks, new size_t(nranks));
 
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-      platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
+      platform::MultiNCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
       pass->Erase(kNCCLCtxs);
-      pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
+      pass->SetNotOwned<platform::MultiNCCLContextMap>(kNCCLCtxs, nctx);
 #endif
     } else if (pass->Type() == "alloc_continuous_space_for_grad_pass" ||
                pass->Type() == "fuse_adam_op_pass" ||
@@ -301,9 +299,12 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
                                                     &local_scopes);
       if (pass->Type() == "fuse_all_reduce_op_pass") {
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-        platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
+        platform::MultiNCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
         pass->Erase(kNCCLCtxs);
-        pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
+        pass->SetNotOwned<platform::MultiNCCLContextMap>(kNCCLCtxs, nctx);
+        pass->Erase(kUseHierarchicalAllReduce);
+        pass->Set<bool>(kUseHierarchicalAllReduce,
+                        new bool(use_hierarchical_allreduce_));
 #endif
       }
     } else if (pass->Type() == "alloc_continuous_space_for_grad_pass") {
@@ -316,6 +317,14 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
       LOG(INFO) << "set enable_sequential_execution:"
                 << enable_sequential_execution_;
     } else if (pass->Type() == "all_reduce_deps_pass") {
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+      platform::MultiNCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
+      pass->Erase(kNCCLCtxs);
+      pass->SetNotOwned<platform::MultiNCCLContextMap>(kNCCLCtxs, nctx);
+      pass->Erase(kUseHierarchicalAllReduce);
+      pass->Set<bool>(kUseHierarchicalAllReduce,
+                      new bool(use_hierarchical_allreduce_));
+#endif
       LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
                 << ", num_trainers:" << num_trainers_;
     } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
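
BuildStrategy::Apply now hands every NCCL-aware pass two things: a non-owned pointer to the MultiNCCLContextMap and an owned copy of the use_hierarchical_allreduce_ flag, each stored under a string key after erasing any previous value. The toy program below imitates that owned-versus-not-owned attribute hand-off with a stand-in AttrStore; it is not paddle::framework::ir::Pass, and the key names are illustrative.

// Toy illustration of the Erase/Set hand-off seen above: a pass-like object
// receives a borrowed context pointer and an owned bool flag under string keys.
#include <cassert>
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct MultiCommMap { int num_comms = 2; };  // stand-in for MultiNCCLContextMap

class AttrStore {
 public:
  void Erase(const std::string& key) {
    owned_.erase(key);
    borrowed_.erase(key);
  }
  // Owned attribute: the store deletes it (mirrors pass->Set<bool>(..., new bool(...))).
  void SetOwnedBool(const std::string& key, bool value) {
    owned_[key] = std::make_shared<bool>(value);
  }
  // Not-owned attribute: the caller keeps ownership (mirrors pass->SetNotOwned<...>).
  void SetNotOwned(const std::string& key, MultiCommMap* ctx) { borrowed_[key] = ctx; }

  bool GetBool(const std::string& key) const { return *owned_.at(key); }
  MultiCommMap* GetCtx(const std::string& key) const { return borrowed_.at(key); }

 private:
  std::map<std::string, std::shared_ptr<bool>> owned_;
  std::map<std::string, MultiCommMap*> borrowed_;
};

int main() {
  MultiCommMap comms;                      // owned by the "builder", not the pass
  bool use_hierarchical_allreduce = true;  // the BuildStrategy field

  AttrStore pass;
  pass.Erase("nccl_ctxs");
  pass.SetNotOwned("nccl_ctxs", &comms);
  pass.Erase("use_hierarchical_allreduce");
  pass.SetOwnedBool("use_hierarchical_allreduce", use_hierarchical_allreduce);

  assert(pass.GetCtx("nccl_ctxs")->num_comms == 2);
  std::cout << "hierarchical allreduce: " << std::boolalpha
            << pass.GetBool("use_hierarchical_allreduce") << "\n";
  return 0;
}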

paddle/fluid/framework/details/build_strategy.h

Lines changed: 12 additions & 1 deletion
@@ -111,6 +111,17 @@ struct BuildStrategy {
   bool cache_runtime_context_{false};
   std::unordered_set<std::string> mkldnn_enabled_op_types_;
 
+  size_t nccl_comm_num_{1};
+  // The picture is here:
+  // https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396
+  bool use_hierarchical_allreduce_{false};
+  // Nccl ranks in a node when use hierarchical allreduce, it's setted to gpu
+  // cards' number in most cases.
+  size_t hierarchical_allreduce_inter_nranks_{0};
+  // Nccl ranks bewteen nodes when use hierarchical allreduce, it's setted to
+  // nodes number.
+  size_t hierarchical_allreduce_exter_nranks_{0};
+
   // NOTE:
   // Before you add new options, think if it's a general strategy that works
   // with other strategy. If not, the strategy should be created through
@@ -136,7 +147,7 @@ struct BuildStrategy {
                   const size_t &nranks,
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
                   const bool use_cuda,
-                  platform::NCCLContextMap *nccl_ctxs) const;
+                  platform::MultiNCCLContextMap *nccl_ctxs) const;
 #else
                   const bool use_cuda) const;
 #endif
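
The comments on the new fields say hierarchical_allreduce_inter_nranks_ is usually the number of GPU cards per node and hierarchical_allreduce_exter_nranks_ the number of nodes. The sketch below shows one plausible way those two counts could slice a global rank into an intra-node group and a cross-node group; the grouping rule is an assumption for illustration and is not taken from this commit.

// Minimal sketch: how per-node rank count (inter) and node count (exter) could
// map a global rank to its hierarchical allreduce groups. Illustrative only.
#include <iostream>

struct HierarchicalGroups {
  int inter_group;  // which node this rank sits on (reduces inside the node)
  int inter_rank;   // rank within that node
  int exter_rank;   // rank in the cross-node group (one participant per node)
  bool in_exter;    // only local rank 0 of each node joins the cross-node step
};

HierarchicalGroups SliceRank(int global_rank, int inter_nranks, int exter_nranks) {
  HierarchicalGroups g;
  g.inter_group = global_rank / inter_nranks;  // node index
  g.inter_rank = global_rank % inter_nranks;   // GPU index inside the node
  g.exter_rank = g.inter_group;                // node index doubles as exter rank
  g.in_exter = (g.inter_rank == 0) && (g.inter_group < exter_nranks);
  return g;
}

int main() {
  const int inter_nranks = 8;  // e.g. 8 GPUs per node
  const int exter_nranks = 4;  // e.g. 4 nodes
  for (int rank = 0; rank < inter_nranks * exter_nranks; rank += 9) {
    HierarchicalGroups g = SliceRank(rank, inter_nranks, exter_nranks);
    std::cout << "rank " << rank << " -> node " << g.inter_group
              << ", local " << g.inter_rank
              << (g.in_exter ? " (joins cross-node allreduce)" : "") << "\n";
  }
  return 0;
}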
