Skip to content

Commit

Permalink
[NPU] Support async copy for TensorFromVector with event (PaddlePaddl…
Browse files Browse the repository at this point in the history
  • Loading branch information
liym27 authored May 12, 2021
1 parent f1d6302 commit 85512d6
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 32 deletions.
57 changes: 53 additions & 4 deletions paddle/fluid/framework/tensor_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ limitations under the License. */
#include "paddle/fluid/framework/dlpack_tensor.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#include "paddle/fluid/platform/device_context.h"

namespace paddle {
Expand Down Expand Up @@ -166,8 +170,30 @@ void TensorFromVector(const std::vector<T>& src,
// Since vector is on cpu, I think this function should be a "sync" operation,
// so pass nullptr as stream to memory::Copy().
else if (platform::is_npu_place(dst_place)) { // NOLINT
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size, nullptr);
// 1. vector -> npu pinned tensor
Tensor npu_pinned_tensor(dst->type());
platform::NPUPinnedPlace npu_pinned_place;
auto npu_pinned_ptr =
npu_pinned_tensor.mutable_data<T>(dst->dims(), npu_pinned_place);
memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);

// 2. async copy npu pinned tensor -> npu tensor
memory::Copy(
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
npu_pinned_place, npu_pinned_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());

// 3. record event
auto npu_pinned_allocator =
static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(npu_pinned_place)
.get());
paddle::memory::allocation::Allocation* allocation =
npu_pinned_tensor.Holder().get();
npu_pinned_allocator->RecordEvent(
allocation,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
}
#endif
}
Expand Down Expand Up @@ -206,8 +232,31 @@ inline void TensorFromVector(const std::vector<bool>& src,
#endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(dst_place)) { // NOLINT
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size, nullptr);
// 1. vector -> npu pinned tensor
platform::NPUPinnedPlace npu_pinned_place;
Tensor npu_pinned_tensor;
npu_pinned_tensor.Resize(dst->dims());
auto npu_pinned_ptr =
npu_pinned_tensor.mutable_data(npu_pinned_place, dst->type());
memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);

// 2. async copy npu pinned tensor -> npu tensor
memory::Copy(
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
npu_pinned_place, npu_pinned_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());

// 3. record event
auto npu_pinned_allocator =
static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(npu_pinned_place)
.get());
paddle::memory::allocation::Allocation* allocation =
npu_pinned_tensor.Holder().get();
npu_pinned_allocator->RecordEvent(
allocation,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
}
#endif
delete[] array;
Expand Down
59 changes: 31 additions & 28 deletions paddle/fluid/operators/npu_op_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ limitations under the License. */
#include <vector>

#include "acl/acl.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/npu_op_runner.h"

namespace paddle {
Expand All @@ -30,6 +31,7 @@ using Tensor = framework::Tensor;
using DataLayout = framework::DataLayout;
using NPUAttribute = framework::NPUAttribute;
using NPUAttributeMap = framework::NPUAttributeMap;
using DeviceContextPool = platform::DeviceContextPool;

class NpuOpRunner {
public:
Expand Down Expand Up @@ -90,41 +92,42 @@ aclrtStream GetCurrentNPUStream(int device_id = -1);

template <typename T>
// Fill an already-initialized NPU tensor with a single constant value.
//
// The scraped diff interleaved the removed (sync copy / Power-op) and added
// implementations; this is the coherent post-commit version:
//  * numel == 1: stage `val` in an NPU-pinned host buffer, launch an async
//    H2D copy on the current NPU stream, and record an event on the pinned
//    allocation so the allocator keeps it alive until the copy finishes.
//  * numel > 1: build a host vector of `val` and reuse TensorFromVector,
//    which performs the same pinned-staging + async copy + event dance.
//
// Preconditions (enforced below): `tensor` is initialized and resides on an
// NPUPlace. `T` must be a POD scalar type supported by the tensor.
void FillNpuTensorWithConstant(Tensor *tensor, T val) {
  PADDLE_ENFORCE_EQ(
      tensor->IsInitialized(), true,
      platform::errors::InvalidArgument("The tensor should be initialized."));
  PADDLE_ENFORCE_EQ(
      platform::is_npu_place(tensor->place()), true,
      platform::errors::InvalidArgument("The tensor should be on NPUPlace."));

  int numel = tensor->numel();
  if (numel == 1) {
    // Stage the scalar in NPU-pinned host memory so the H2D copy can be
    // issued asynchronously on the current stream.
    Tensor npu_pinned_tensor(tensor->type());
    platform::NPUPinnedPlace npu_pinned_place;
    auto npu_pinned_ptr =
        npu_pinned_tensor.mutable_data<T>({1}, npu_pinned_place);
    *npu_pinned_ptr = val;

    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()),
                 tensor->data<void>(), npu_pinned_place, npu_pinned_ptr,
                 sizeof(T), GetCurrentNPUStream());

    // Record an event on the pinned allocation: the NPUPinnedAllocator must
    // not release/reuse the staging buffer before the async copy completes.
    auto npu_pinned_allocator =
        static_cast<paddle::memory::allocation::NPUPinnedAllocator *>(
            paddle::memory::allocation::AllocatorFacade::Instance()
                .GetAllocator(npu_pinned_place)
                .get());
    paddle::memory::allocation::Allocation *allocation =
        npu_pinned_tensor.Holder().get();

    npu_pinned_allocator->RecordEvent(allocation, GetCurrentNPUStream());
  } else {
    // Multi-element fill: delegate to TensorFromVector, which also stages
    // through pinned memory and records the completion event.
    std::vector<T> vec(numel, static_cast<T>(val));
    auto device_id = platform::GetCurrentNPUDeviceId();
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto *dev_ctx = static_cast<platform::NPUDeviceContext *>(
        pool.Get(platform::NPUPlace(device_id)));

    paddle::framework::TensorFromVector<T>(vec, *dev_ctx, tensor);
  }
}

Expand Down

0 comments on commit 85512d6

Please sign in to comment.