Skip to content

Support unified H2D copy of NPU weights before inference #39160

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions paddle/fluid/inference/analysis/argument.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,10 @@ struct Argument {
DECL_ARGUMENT_FIELD(ipu_batch_size, IpuBatchSize, int);
DECL_ARGUMENT_FIELD(ipu_need_avg_shard, IpuNeedAvgShard, bool);

// npu related
DECL_ARGUMENT_FIELD(use_npu, UseNpu, bool);
DECL_ARGUMENT_FIELD(npu_device_id, NPUDeviceId, int);

private:
std::unordered_set<std::string> valid_fields_;
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,50 @@ namespace paddle {
namespace inference {
namespace analysis {

void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
PADDLE_ENFORCE_EQ(
argument->scope_valid(), true,
platform::errors::PreconditionNotMet("The scope field should be valid"));
PADDLE_ENFORCE_EQ(argument->use_gpu_valid(), true,
#ifdef PADDLE_WITH_ASCEND_CL
// Copies every tensor variable found in the local scope onto the NPU selected
// by the `npu_device_id` field. Returns immediately when `use_npu` is false.
//
// Each tensor is first copied into a temporary CPU tensor, the original
// tensor is cleared, and the temporary is then copied back into it at the
// NPU place.
//
// Raises PreconditionNotMet when the `npu_device_id` field is not valid, or
// when a name returned by LocalVarNames() cannot be resolved to a variable.
void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) {
  if (!argument->use_npu()) return;

  auto &graph = argument->main_graph();
  std::vector<std::string> repetitive_params;

  if (graph.Has(framework::ir::kRepetitiveParamAttr))
    repetitive_params = graph.Get<std::vector<std::string>>(
        framework::ir::kRepetitiveParamAttr);
  // NOTE(review): repetitive_params is collected here but never consulted in
  // the loop below — confirm whether repeated parameters should be skipped.

  LOG(INFO) << "Sync params from CPU to NPU";

  PADDLE_ENFORCE_EQ(argument->npu_device_id_valid(), true,
                    platform::errors::PreconditionNotMet(
                        "The npu_device_id field should be valid"));
  platform::Place place = platform::NPUPlace(argument->npu_device_id());
  auto *scope = argument->scope_ptr();
  std::vector<std::string> all_vars = scope->LocalVarNames();

  for (const auto &var_name : all_vars) {
    auto *var = scope->FindLocalVar(var_name);
    PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet(
                                     "The var should not be nullptr"));

    // Only tensor-typed variables are moved; anything else is left untouched.
    if (var->IsType<framework::LoDTensor>() ||
        var->IsType<framework::Tensor>()) {
      auto *t = var->GetMutable<framework::LoDTensor>();

      // Stage the data in a CPU-side temporary so the original tensor can be
      // cleared before it is re-populated at the NPU place.
      platform::CPUPlace cpu_place;
      framework::LoDTensor temp_tensor;
      temp_tensor.Resize(t->dims());
      // NOTE(review): the buffer is pre-allocated as float regardless of the
      // tensor's element type — presumably TensorCopySync re-allocates to the
      // source type; confirm for non-float weights.
      temp_tensor.mutable_data<float>(cpu_place);

      paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
      t->clear();
      paddle::framework::TensorCopySync(temp_tensor, place, t);
    }
  }
}

#else

void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
// When the GPU is not used, the parameters stay on the CPU, so no synchronization is necessary.
if (!argument->use_gpu()) return;

Expand All @@ -47,8 +81,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
PADDLE_ENFORCE_EQ(argument->gpu_device_id_valid(), true,
platform::errors::PreconditionNotMet(
"The gpu_device_id field should be valid"));
place = platform::CUDAPlace(argument->gpu_device_id());

platform::Place place = platform::CUDAPlace(argument->gpu_device_id());
auto *scope = argument->scope_ptr();
std::vector<std::string> all_vars = scope->LocalVarNames();

Expand Down Expand Up @@ -100,6 +133,22 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
}
}

#endif

// Runs the pass: checks that the scope field is valid, then hands off to the
// device-specific parameter copy routine selected at compile time.
//
// Raises PreconditionNotMet when the scope field is not valid. When the
// relevant use_npu / use_gpu field has not been set, nothing is copied.
void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
  PADDLE_ENFORCE_EQ(
      argument->scope_valid(), true,
      platform::errors::PreconditionNotMet("The scope field should be valid"));

#ifdef PADDLE_WITH_ASCEND_CL
  // Ascend build: sync to the NPU only when the use_npu field is valid.
  if (argument->use_npu_valid()) {
    CopyParamsToNpu(argument);
  }
#else
  // All other builds: sync to the GPU only when the use_gpu field is valid.
  if (argument->use_gpu_valid()) {
    CopyParamsToGpu(argument);
  }
#endif
}

// Returns the fixed identifier string of this pass.
std::string IrParamsSyncAmongDevicesPass::repr() const {
  static constexpr char kPassName[] = "ir-params-sync-among-devices-pass";
  return std::string(kPassName);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass {
public:
void RunImpl(Argument *argument) override;
std::string repr() const override;

private:
#ifdef PADDLE_WITH_ASCEND_CL
void CopyParamsToNpu(Argument *argument);
#else
void CopyParamsToGpu(Argument *argument);
#endif
};

} // namespace analysis
Expand Down
3 changes: 3 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -668,6 +668,9 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetIpuBatchSize(config_.ipu_batch_size_);
argument_.SetIpuNeedAvgShard(config_.ipu_need_avg_shard_);

argument_.SetUseNpu(config_.use_npu_);
argument_.SetNPUDeviceId(config_.npu_device_id());

if (config_.use_mkldnn_) {
LOG(INFO) << "MKLDNN is enabled";
argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
Expand Down