
Commit e7f5d31

Merge branch 'PaddlePaddle:develop' into updatedocs
2 parents: 10d4b5e + 4e66010

290 files changed: +7698 −7156 lines

AUTHORS.md

Lines changed: 1 addition & 0 deletions

@@ -57,6 +57,7 @@
 | reyoung | Yang Yu |
 | [Sand3r-](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg)| Michal Gallus |
 | [sfraczek](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg)| Sylwester Fraczek |
+| Silv3S | Slawomir Siwek |
 | sneaxiy | Jin-Le Zeng |
 | Superjom | Chun-Wei Yan |
 | tensor-tang | Jian Tang |

CMakeLists.txt

Lines changed: 6 additions & 2 deletions

@@ -100,7 +100,11 @@ if(APPLE AND WITH_ARM)
 endif()

 if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
+  if(WITH_ARM_BRPC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
+  else()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
+  endif()
 endif()

 if(WIN32)

@@ -386,7 +390,7 @@ if(WITH_DISTRIBUTE)
   if(LINUX)
     set(WITH_GLOO ON CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE)
   endif()
-  if(WITH_ASCEND_CL)
+  if(WITH_ASCEND_CL AND NOT WITH_ARM_BRPC)
     # disable WITH_PSCORE for NPU before include third_party
     MESSAGE(WARNING "Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.")
     set(WITH_PSCORE OFF CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE)
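The ABI macro toggled above selects between libstdc++'s two incompatible std::string/std::list layouts, and every object linked into one binary must agree on it; the WITH_ARM_BRPC path presumably links against a brpc prebuilt with the C++11 ABI, hence =1 there and =0 otherwise. As a minimal sketch of what the flag means (assuming GCC/libstdc++; this snippet is not part of the commit):

    #include <iostream>

    int main() {
    #if defined(_GLIBCXX_USE_CXX11_ABI) && _GLIBCXX_USE_CXX11_ABI == 1
      // std::string is std::__cxx11::basic_string here; its mangled symbol
      // names differ from the =0 build, so mixing the two fails at link time.
      std::cout << "compiled with the C++11 library ABI\n";
    #else
      std::cout << "compiled with the pre-C++11 library ABI\n";
    #endif
    }

Compiling the same file with -D_GLIBCXX_USE_CXX11_ABI=0 and =1 shows the two configurations this commit switches between.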

README.md

Lines changed: 1 addition & 1 deletion

@@ -49,7 +49,7 @@ Now our developers can acquire Tesla V100 online computing resources for free. I
 [Click here to learn more](https://github.com/PaddlePaddle/Fleet)


-- **High-Performance Inference Engines for Comprehensive Deployment Enviroments**
+- **High-Performance Inference Engines for Comprehensive Deployment Environments**

   PaddlePaddle is not only compatible with models trained in 3rd party open-source frameworks , but also offers complete inference products for various production scenarios. Our inference product line includes [Paddle Inference](https://paddle-inference.readthedocs.io/en/latest/product_introduction/summary.html): Native inference library for high-performance server and cloud inference; [Paddle Serving](https://github.com/PaddlePaddle/Serving): A service-oriented framework suitable for distributed and pipeline productions; [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite): Ultra-Lightweight inference engine for mobile and IoT environments; [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs): A frontend inference engine for browser and mini-apps. Furthermore, by great amounts of optimization with leading hardware in each scenario, Paddle inference engines outperform most of the other mainstream frameworks.

cmake/coverallsGcovJsons.cmake

Lines changed: 1 addition & 1 deletion

@@ -238,7 +238,7 @@ foreach (GCOV_FILE ${GCOV_FILES})
 message("MD5: ${GCOV_SRC_PATH} = ${GCOV_CONTENTS_MD5}")

 # Loads the gcov file as a list of lines.
-# (We first open the file and replace all occurences of [] with _
+# (We first open the file and replace all occurrences of [] with _
 # because CMake will fail to parse a line containing unmatched brackets...
 # also the \ to escaped \n in macros screws up things.)
 # https://public.kitware.com/Bug/view.php?id=15369

cmake/external/xpu.cmake

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ SET(XPU_RT_LIB_NAME "libxpurt.so")

 if(NOT DEFINED XPU_BASE_URL)
   SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220411")
+  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220425")
 else()
   SET(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()

cmake/flags.cmake

Lines changed: 4 additions & 0 deletions

@@ -158,6 +158,10 @@ if(WITH_IPU)
   )
 endif()

+if(WITH_ASCEND_CL AND WITH_ARM_BRPC)
+  set(COMMON_FLAGS ${COMMON_FLAGS} -faligned-new)
+endif()
+
 if(NOT APPLE)
   if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM))
     set(COMMON_FLAGS
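-faligned-new tells GCC to honor C++17 over-aligned allocation even in earlier language modes, which matters once a type's alignment exceeds the default operator new alignment. A minimal sketch of the kind of type that needs it (illustrative only, not from the commit; assumes GCC):

    #include <cstdint>
    #include <cstdio>

    // alignof(Padded) is 64, above __STDCPP_DEFAULT_NEW_ALIGNMENT__ (typically
    // 16), so `new Padded` requires the aligned operator new that
    // -faligned-new enables in pre-C++17 modes.
    struct alignas(64) Padded {
      float data[16];
    };

    int main() {
      Padded* p = new Padded;
      std::printf("64-byte aligned: %s\n",
                  reinterpret_cast<std::uintptr_t>(p) % 64 == 0 ? "yes" : "no");
      delete p;
    }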

cmake/hip.cmake

Lines changed: 2 additions & 0 deletions

@@ -71,8 +71,10 @@ set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS})
 # host linker to link.
 list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc)
 list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx906)
+list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx908)
 list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc)
 list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906)
+list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx908)


 if(HIP_COMPILER STREQUAL clang)

paddle/fluid/distributed/collective/ProcessGroupHeter.cc

File mode changed from 100755 to 100644.
Lines changed: 104 additions & 2 deletions

@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/distributed/collective/ProcessGroupHeter.h"
+#include <chrono>
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/phi/api/include/api.h"

@@ -24,6 +25,8 @@ namespace paddle {
 namespace distributed {

 using Place = paddle::platform::Place;
+int ProcessGroupHeter::send_count = 0;
+int ProcessGroupHeter::recv_count = 0;

 std::shared_ptr<ProcessGroupHeter::HeterTask> ProcessGroupHeter::CreateTask(
     int rank, CommType comm_type, const std::vector<phi::DenseTensor>& inputs) {

@@ -47,15 +50,19 @@ bool ProcessGroupHeter::HeterTask::Wait(std::chrono::milliseconds timeout) {
 ProcessGroupHeter::ProcessGroupHeter(
     const std::shared_ptr<Store>& store, int rank, int size,
     const platform::Place& place, int gid, int local_rank, int local_size,
-    int gloo_rank, int gloo_size, bool with_switch, std::string switch_endpoint)
+    int gloo_rank, int gloo_size, bool with_switch, std::string switch_endpoint,
+    int src_rank, int dst_rank)
     : ProcessGroup(rank, size, place, gid),
       store_(store),
       local_rank_(local_rank),
       local_size_(local_size),
       gloo_rank_(gloo_rank),
       gloo_size_(gloo_size),
       with_switch_(with_switch),
-      switch_endpoint_(switch_endpoint) {
+      switch_endpoint_(switch_endpoint),
+      src_rank_(src_rank),
+      dst_rank_(dst_rank) {
+  return;
 #if defined(PADDLE_WITH_NCCL)
   inner_pg_ = std::make_shared<ProcessGroupNCCL>(store, local_rank, local_size,
                                                  place_, IGNORE_ID);

@@ -246,5 +253,100 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Broadcast(
   return CreateTask(rank_, CommType::BROADCAST, in_tensors);
 }

+std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Send(
+    std::vector<phi::DenseTensor>& in_tensors, int peer) {
+#if defined(PADDLE_WITH_NCCL)
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCudaPlace(in_tensors), true,
+      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
+#endif
+
+  PADDLE_ENFORCE_EQ(
+      in_tensors.size(), 1,
+      platform::errors::PreconditionNotMet(
+          "For each send operation, there can only be one tensor to send."));
+  // Copy Tensor to cpu
+  auto start = std::chrono::high_resolution_clock::now();
+  phi::DenseTensor cpu_tensor;
+  auto& gpu_tensor = in_tensors[0];
+  framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor);
+  PADDLE_ENFORCE_EQ(with_switch_, true,
+                    platform::errors::PreconditionNotMet(
+                        "Gloo does not support the send operation."));
+  auto end = std::chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = end - start;
+  VLOG(2) << "Time to copy tensor of dims(" << cpu_tensor.dims()
+          << ") from gpu to cpu for send " << std::setw(9)
+          << " is: " << diff.count() << " s" << std::endl;
+
+  // Send to switch
+  HeterClient* client_ =
+      HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
+  int64_t tensor_size =
+      cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype());
+  std::vector<int64_t> send_size;
+  send_size.push_back(tensor_size);
+  auto id = src_rank_ * 10000 + dst_rank_;
+  std::string tensor_name = std::to_string(gid_) + "_id_" +
+                            std::to_string(id) + std::string("_") +
+                            std::to_string(send_count++);
+  VLOG(2) << "tensor_name: " << tensor_name;
+  int ret = client_->Send(gid_, {tensor_name}, send_size, cpu_tensor.data(),
+                          tensor_size);
+  PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
+                                "Send to the switch module error."));
+  return CreateTask(rank_, CommType::SEND, in_tensors);
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Recv(
+    std::vector<phi::DenseTensor>& out_tensors, int peer) {
+#if defined(PADDLE_WITH_NCCL)
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCudaPlace(out_tensors), true,
+      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
+#endif
+
+  PADDLE_ENFORCE_EQ(
+      out_tensors.size(), 1,
+      platform::errors::PreconditionNotMet(
+          "For each recv operation, there can only be one tensor to receive."));
+
+  // Prepare a host-side staging tensor matching the GPU tensor's metadata
+  phi::DenseTensor cpu_tensor;
+  auto& gpu_tensor = out_tensors[0];
+  cpu_tensor.Resize(gpu_tensor.dims());
+  cpu_tensor.set_layout(gpu_tensor.layout());
+  cpu_tensor.mutable_data(platform::CPUPlace(), gpu_tensor.dtype());
+
+  PADDLE_ENFORCE_EQ(with_switch_, true,
+                    platform::errors::PreconditionNotMet(
+                        "Gloo does not support the recv operation."));
+  // Recv from switch
+  HeterClient* client_ =
+      HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
+  auto id = src_rank_ * 10000 + dst_rank_;
+  std::string tensor_name = std::to_string(gid_) + "_id_" +
+                            std::to_string(id) + std::string("_") +
+                            std::to_string(recv_count++);
+  VLOG(2) << "tensor_name: " << tensor_name;
+  auto start = std::chrono::high_resolution_clock::now();
+  int ret = client_->Recv(
+      gid_, {tensor_name}, cpu_tensor.data(),
+      cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype()));
+  PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
+                                "Receive from the switch module error."));
+  auto end = std::chrono::high_resolution_clock::now();
+  std::chrono::duration<double> diff = end - start;
+  double goodput = cpu_tensor.numel() *
+                   framework::DataTypeSize(cpu_tensor.dtype()) / diff.count();
+  VLOG(2) << "Goodput: " << goodput << " B/s" << std::endl;
+  start = std::chrono::high_resolution_clock::now();
+  framework::TensorCopySync(cpu_tensor, gpu_tensor.place(), &gpu_tensor);
+  end = std::chrono::high_resolution_clock::now();
+  diff = end - start;
+  VLOG(2) << "Time to copy tensor of dims(" << cpu_tensor.dims()
+          << ") from cpu to gpu for recv " << std::setw(9)
+          << " is: " << diff.count() << " s" << std::endl;
+  return CreateTask(rank_, CommType::RECV, out_tensors);
+}
+
 } // namespace distributed
 } // namespace paddle
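Note how the new Send/Recv never contact the peer directly: each side stages the tensor in host memory and exchanges it with the switch service under a key both sides derive independently from (gid, src_rank, dst_rank) plus a per-direction counter, so the k-th Send matches the k-th Recv. A standalone sketch of that key derivation (hypothetical helper, not part of the commit):

    #include <iostream>
    #include <string>

    // Mirrors the naming in ProcessGroupHeter::Send/Recv: the (src, dst) pair
    // is packed into one id, and the counter disambiguates successive tensors.
    std::string MakeTensorKey(int gid, int src_rank, int dst_rank, int count) {
      int id = src_rank * 10000 + dst_rank;
      return std::to_string(gid) + "_id_" + std::to_string(id) + "_" +
             std::to_string(count);
    }

    int main() {
      // Sender and receiver each advance their own static counter
      // (send_count / recv_count), so matching relies on identical call order.
      std::cout << MakeTensorKey(/*gid=*/0, /*src_rank=*/1, /*dst_rank=*/2,
                                 /*count=*/0)
                << "\n";  // prints 0_id_10002_0
    }

Because the counters are static and unsynchronized, the pairing assumes both ranks issue their sends and receives in the same order.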

paddle/fluid/distributed/collective/ProcessGroupHeter.h

Lines changed: 12 additions & 1 deletion

@@ -83,7 +83,8 @@ class ProcessGroupHeter : public ProcessGroup {
   ProcessGroupHeter(const std::shared_ptr<Store>& store, int rank, int size,
                     const platform::Place& place, int gid, int local_rank,
                     int local_size, int gloo_rank, int gloo_size,
-                    bool with_switch, std::string switch_endpoints);
+                    bool with_switch, std::string switch_endpoints,
+                    int src_rank, int dst_rank);

   const std::string GetBackendName() const override {
     return std::string(HETER_BACKEND_NAME);

@@ -97,6 +98,12 @@ class ProcessGroupHeter : public ProcessGroup {
       std::vector<phi::DenseTensor>&, std::vector<phi::DenseTensor>&,
       const BroadcastOptions& = BroadcastOptions()) override;

+  std::shared_ptr<ProcessGroup::Task> Send(
+      std::vector<phi::DenseTensor>& in_tensors, int peer) override;
+
+  std::shared_ptr<ProcessGroup::Task> Recv(
+      std::vector<phi::DenseTensor>& out_tensors, int peer) override;
+
  protected:
   virtual std::shared_ptr<ProcessGroupHeter::HeterTask> CreateTask(
       int rank, CommType opType, const std::vector<phi::DenseTensor>& inputs);

@@ -112,6 +119,10 @@ class ProcessGroupHeter : public ProcessGroup {
   int gloo_size_;
   bool with_switch_;
   std::string switch_endpoint_;
+  int src_rank_;
+  int dst_rank_;
+  static int send_count;
+  static int recv_count;
 };

 } // namespace distributed
