
Commit 9a3054c

[Inference]ort backend optimizer (#44136)
* add ort clone interface
* paddle2onnx update to 1.0.0rc
* ort input_tensor use mutable data of scope
1 parent 13a250a commit 9a3054c
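
For context on how these changes surface to users: with Clone() implemented and inputs backed by scope-owned tensors, an ONNX Runtime predictor can be created and duplicated through the public paddle_infer C++ API. The sketch below is illustrative only; the model paths, input shape, and the EnableONNXRuntime/EnableORTOptimization calls assume a standard Paddle Inference 2.x setup and are not part of this commit.

```cpp
#include <vector>

#include "paddle_inference_api.h"

int main() {
  // Assumed model files; any exported inference model works the same way.
  paddle_infer::Config config("./model.pdmodel", "./model.pdiparams");
  config.EnableONNXRuntime();      // run inference through the ORT backend
  config.EnableORTOptimization();  // let ONNX Runtime optimize the graph

  auto predictor = paddle_infer::CreatePredictor(config);
  // New in this commit: Clone() returns a usable predictor instead of nullptr,
  // so each worker thread can own an independent copy.
  auto worker = predictor->Clone();

  auto input_names = worker->GetInputNames();
  auto input = worker->GetInputHandle(input_names[0]);
  std::vector<float> data(1 * 3 * 224 * 224, 0.0f);  // placeholder input
  input->Reshape({1, 3, 224, 224});
  input->CopyFromCpu(data.data());  // fills the scope-owned LoDTensor
  worker->Run();
  return 0;
}
```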

File tree

4 files changed, +61 -149 lines changed

cmake/external/paddle2onnx.cmake
paddle/fluid/inference/api/details/zero_copy_tensor.cc
paddle/fluid/inference/api/onnxruntime_predictor.cc
paddle/fluid/inference/api/onnxruntime_predictor.h

cmake/external/paddle2onnx.cmake

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ endif()
 include(ExternalProject)
 
 set(PADDLE2ONNX_PROJECT "extern_paddle2onnx")
-set(PADDLE2ONNX_VERSION "0.9.9")
+set(PADDLE2ONNX_VERSION "1.0.0rc")
 set(PADDLE2ONNX_PREFIX_DIR ${THIRD_PARTY_PATH}/paddle2onnx)
 set(PADDLE2ONNX_SOURCE_DIR
     ${THIRD_PARTY_PATH}/paddle2onnx/src/${PADDLE2ONNX_PROJECT})

paddle/fluid/inference/api/details/zero_copy_tensor.cc

Lines changed: 0 additions & 120 deletions
@@ -179,13 +179,6 @@ PlaceType Tensor::place() const { return place_; }
 
 template <typename T>
 void Tensor::CopyFromCpu(const T *data) {
-#ifdef PADDLE_WITH_ONNXRUNTIME
-  if (is_ort_tensor_) {
-    ORTCopyFromCpu<T>(data);
-    return;
-  }
-#endif
-
   EAGER_GET_TENSOR(paddle::framework::LoDTensor);
   PADDLE_ENFORCE_GE(tensor->numel(),
                     0,
@@ -731,112 +724,6 @@ void Tensor::SetOrtBuffer(const std::shared_ptr<std::vector<int8_t>> buffer) {
   buffer_ = buffer;
 }
 
-Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info,
-                       float *data,
-                       size_t size,
-                       const int64_t *shape,
-                       size_t shape_len) {
-  return Ort::Value::CreateTensor<float>(
-      memory_info, data, size, shape, shape_len);
-}
-
-Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info,
-                       int64_t *data,
-                       size_t size,
-                       const int64_t *shape,
-                       size_t shape_len) {
-  return Ort::Value::CreateTensor<int64_t>(
-      memory_info, data, size, shape, shape_len);
-}
-
-Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info,
-                       int32_t *data,
-                       size_t size,
-                       const int64_t *shape,
-                       size_t shape_len) {
-  return Ort::Value::CreateTensor<int32_t>(
-      memory_info, data, size, shape, shape_len);
-}
-
-Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info,
-                       uint8_t *data,
-                       size_t size,
-                       const int64_t *shape,
-                       size_t shape_len) {
-  return Ort::Value::CreateTensor<uint8_t>(
-      memory_info, data, size, shape, shape_len);
-}
-
-Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info,
-                       int8_t *data,
-                       size_t size,
-                       const int64_t *shape,
-                       size_t shape_len) {
-  return Ort::Value::CreateTensor<int8_t>(
-      memory_info, data, size, shape, shape_len);
-}
-
-Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info,
-                       float16 *data,
-                       size_t size,
-                       const int64_t *shape,
-                       size_t shape_len) {
-  return Ort::Value::CreateTensor(memory_info,
-                                  static_cast<void *>(data),
-                                  size * sizeof(float16),
-                                  shape,
-                                  shape_len,
-                                  ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16);
-}
-
-template <typename T>
-void Tensor::ORTCopyFromCpu(const T *data) {
-  auto binding = binding_.lock();
-  PADDLE_ENFORCE_NOT_NULL(binding,
-                          paddle::platform::errors::PreconditionNotMet(
-                              "input tensor [%s] no binding ptr", name_));
-  const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda";
-  Ort::MemoryInfo memory_info(
-      device_name, OrtDeviceAllocator, device_, OrtMemTypeDefault);
-  size_t size = std::accumulate(
-      begin(shape_), end(shape_), 1UL, std::multiplies<size_t>());
-  auto buffer = buffer_.lock();
-  size_t buffer_size = size * sizeof(T);
-  if (buffer_size > buffer->size()) {
-    buffer->resize(buffer_size);
-  }
-  std::memcpy(static_cast<void *>(buffer->data()), data, buffer_size);
-
-  auto onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED;
-  if (std::is_same<T, float>::value) {
-    onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
-  } else if (std::is_same<T, double>::value) {
-    onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE;
-  } else if (std::is_same<T, int64_t>::value) {
-    onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
-  } else if (std::is_same<T, int32_t>::value) {
-    onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32;
-  } else if (std::is_same<T, uint8_t>::value) {
-    onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8;
-  } else if (std::is_same<T, int8_t>::value) {
-    onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8;
-  } else if (std::is_same<T, float16>::value) {
-    onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
-  } else {
-    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
-        "Found undefined data type for onnxruntime, only supports "
-        "float16/float32/float64/int8/uint8/int32/int64."));
-  }
-
-  auto ort_value = Ort::Value::CreateTensor(memory_info,
-                                            buffer->data(),
-                                            buffer_size,
-                                            shape_.data(),
-                                            shape_.size(),
-                                            onnx_dtype);
-  binding->BindInput(name_.c_str(), ort_value);
-}
-
 template <typename T>
 void Tensor::ORTCopyToCpu(T *data) const {
   auto binding = binding_.lock();
@@ -857,13 +744,6 @@ void Tensor::ORTCopyToCpu(T *data) const {
   }
 }
 
-template void Tensor::ORTCopyFromCpu<float>(const float *data);
-template void Tensor::ORTCopyFromCpu<int64_t>(const int64_t *data);
-template void Tensor::ORTCopyFromCpu<int32_t>(const int32_t *data);
-template void Tensor::ORTCopyFromCpu<uint8_t>(const uint8_t *data);
-template void Tensor::ORTCopyFromCpu<int8_t>(const int8_t *data);
-template void Tensor::ORTCopyFromCpu<float16>(const float16 *data);
-
 template void Tensor::ORTCopyToCpu<float>(float *data) const;
 template void Tensor::ORTCopyToCpu<int32_t>(int32_t *data) const;
 template void Tensor::ORTCopyToCpu<uint8_t>(uint8_t *data) const;
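
The per-type GetOrtVaule overloads deleted above existed only to dispatch to the typed Ort::Value::CreateTensor<T>; the replacement path (GetOrtValue in onnxruntime_predictor.cc below) uses the untyped overload that takes the element type as a runtime enum. A minimal standalone sketch of that ONNX Runtime C++ API pattern; WrapFloatBuffer is a hypothetical helper, not code from this commit:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

#include "onnxruntime_cxx_api.h"  // NOLINT

// Wrap an existing host buffer as an Ort::Value without copying it.
// Passing the dtype as an ONNXTensorElementDataType removes the need for
// one overload per element type.
Ort::Value WrapFloatBuffer(float *data, const std::vector<int64_t> &shape) {
  Ort::MemoryInfo memory_info =
      Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
  size_t numel = 1;
  for (int64_t d : shape) numel *= static_cast<size_t>(d);
  return Ort::Value::CreateTensor(memory_info,
                                  static_cast<void *>(data),
                                  numel * sizeof(float),
                                  shape.data(),
                                  shape.size(),
                                  ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);
}
```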

paddle/fluid/inference/api/onnxruntime_predictor.cc

Lines changed: 45 additions & 23 deletions
@@ -24,11 +24,10 @@
 #include <utility>
 #include <vector>
 
-#include "paddle/fluid//platform/device/gpu/gpu_types.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/version.h"
+#include "paddle/fluid/framework/var_type_traits.h"
+#include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/inference/analysis/helper.h"
-#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
@@ -97,6 +96,7 @@ bool ONNXRuntimePredictor::Init() {
   } else {
     place_ = paddle::platform::CPUPlace();
   }
+  scope_.reset(new paddle::framework::Scope());
 
   char *onnx_proto = nullptr;
   int out_size;
@@ -147,6 +147,8 @@ bool ONNXRuntimePredictor::Init() {
   Ort::Allocator allocator(session_, memory_info);
 
   size_t n_inputs = session_.GetInputCount();
+  framework::proto::VarType::Type proto_type =
+      framework::proto::VarType::LOD_TENSOR;
   for (size_t i = 0; i < n_inputs; ++i) {
     auto input_name = session_.GetInputName(i, allocator);
     auto type_info = session_.GetInputTypeInfo(i);
@@ -155,6 +157,10 @@ bool ONNXRuntimePredictor::Init() {
     ONNXTensorElementDataType data_type =
         type_info.GetTensorTypeAndShapeInfo().GetElementType();
     input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type});
+
+    auto *ptr = scope_->Var(input_name);
+    framework::InitializeVariable(ptr, proto_type);
+
     allocator.Free(input_name);
   }
 
@@ -249,13 +255,13 @@ bool ONNXRuntimePredictor::FindONNXDesc(const std::string &name,
 
 std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetInputTensor(
     const std::string &name) {
-  PADDLE_ENFORCE_EQ(FindONNXDesc(name, true),
-                    true,
-                    platform::errors::PreconditionNotMet(
-                        "The in variable named %s is not found in the "
-                        "ONNXPredictor.",
-                        name));
-  std::unique_ptr<ZeroCopyTensor> res(new ZeroCopyTensor(nullptr, this));
+  PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name),
+                          platform::errors::PreconditionNotMet(
+                              "The in variable named %s is not found in the "
+                              "ONNXPredictor.",
+                              name));
+  std::unique_ptr<ZeroCopyTensor> res(
+      new ZeroCopyTensor(static_cast<void *>(scope_.get()), this));
   res->input_or_output_ = true;
   res->SetName(name);
   if (platform::is_cpu_place(place_)) {
@@ -264,16 +270,6 @@ std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetInputTensor(
     auto gpu_place = place_;
     res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
   }
-  res->SetOrtMark(true);
-  res->SetOrtBinding(binding_);
-  auto iter = input_buffers_.find(name);
-  if (iter == input_buffers_.end()) {
-    std::vector<int8_t> i_vector;
-    input_buffers_[name] = std::make_shared<std::vector<int8_t>>(i_vector);
-    res->SetOrtBuffer(input_buffers_[name]);
-  } else {
-    res->SetOrtBuffer(iter->second);
-  }
   return res;
 }
 
@@ -306,6 +302,24 @@ std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetOutputTensor(
   return res;
 }
 
+Ort::Value ONNXRuntimePredictor::GetOrtValue(const ONNXDesc &desc,
+                                             const char *device_name) {
+  Ort::MemoryInfo memory_info(
+      device_name, OrtDeviceAllocator, place_.GetDeviceId(), OrtMemTypeDefault);
+  auto *var = scope_->FindVar(desc.name);
+  auto *tensor = var->GetMutable<framework::LoDTensor>();
+  size_t size =
+      tensor->numel() *
+      framework::SizeOfType(framework::TransToProtoVarType(tensor->dtype()));
+  std::vector<int64_t> shape = phi::vectorize<int64_t>(tensor->dims());
+  return Ort::Value::CreateTensor(memory_info,
+                                  static_cast<void *>(tensor->data()),
+                                  size,
+                                  shape.data(),
+                                  shape.size(),
+                                  desc.dtype);
+}
+
 bool ONNXRuntimePredictor::Run(const std::vector<PaddleTensor> &inputs,
                                std::vector<PaddleTensor> *output_data,
                                int batch_size) {
@@ -315,7 +329,13 @@ bool ONNXRuntimePredictor::Run(const std::vector<PaddleTensor> &inputs,
 
 bool ONNXRuntimePredictor::ZeroCopyRun() {
   try {
-    const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda";
+    const char *device_name = platform::is_cpu_place(place_) ? "Cpu" : "Cuda";
+    std::vector<Ort::Value> inputs;
+    inputs.reserve(input_desc_.size());
+    for (auto desc : input_desc_) {
+      inputs.push_back(GetOrtValue(desc, device_name));
+      binding_->BindInput(desc.name.c_str(), inputs.back());
+    }
     for (auto output : output_desc_) {
       Ort::MemoryInfo out_memory_info(device_name,
                                       OrtDeviceAllocator,
@@ -333,8 +353,10 @@ bool ONNXRuntimePredictor::ZeroCopyRun() {
 }
 
 std::unique_ptr<PaddlePredictor> ONNXRuntimePredictor::Clone(void *stream) {
-  LOG(ERROR) << "Not support Clone(), Please create new Predictor";
-  return nullptr;
+  std::lock_guard<std::mutex> lk(clone_mutex_);
+  auto *x = new ONNXRuntimePredictor(config_);
+  x->Init();
+  return std::unique_ptr<PaddlePredictor>(x);
 }
 
 uint64_t ONNXRuntimePredictor::TryShrinkMemory() {
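
ZeroCopyRun now rebuilds the input bindings on every call: each input's scope tensor is wrapped by GetOrtValue and bound through the shared Ort::IoBinding before the session runs, so resized inputs are picked up without any per-tensor buffer bookkeeping. A reduced sketch of that IoBinding pattern in isolation; the free function and its parameters are illustrative, not code from this commit:

```cpp
#include "onnxruntime_cxx_api.h"  // NOLINT

// Bind one input value and one allocator-backed output, then run the session.
// Re-binding before every run is what lets the predictor drop the old
// SetOrtBinding/SetOrtBuffer plumbing.
void RunOnce(Ort::Session &session,
             const char *input_name,
             Ort::Value &input_value,
             const char *output_name) {
  Ort::IoBinding binding(session);
  binding.BindInput(input_name, input_value);
  Ort::MemoryInfo out_info =
      Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
  binding.BindOutput(output_name, out_info);  // ORT allocates the output
  session.Run(Ort::RunOptions{nullptr}, binding);
}
```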

paddle/fluid/inference/api/onnxruntime_predictor.h

Lines changed: 15 additions & 5 deletions
@@ -21,8 +21,6 @@
 
 #include "onnxruntime_c_api.h"    // NOLINT
 #include "onnxruntime_cxx_api.h"  // NOLINT
-#include "paddle/fluid/framework/naive_executor.h"
-#include "paddle/fluid/framework/op_compatible_info.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
@@ -94,7 +92,7 @@ class ONNXRuntimePredictor : public PaddlePredictor {
   /// \param[in] AnalysisConfig config
   ///
   explicit ONNXRuntimePredictor(const AnalysisConfig &config)
-      : config_(config), env_(ORT_LOGGING_LEVEL_WARNING, "onnx") {
+      : env_(ORT_LOGGING_LEVEL_WARNING, "onnx"), config_(config) {
     predictor_id_ = inference::GetUniqueId();
   }
   ///
@@ -176,6 +174,8 @@ class ONNXRuntimePredictor : public PaddlePredictor {
   ///
   std::unique_ptr<PaddlePredictor> Clone(void *stream = nullptr) override;
 
+  std::shared_ptr<framework::Scope> scope_;
+
  protected:
   const void *GetDeviceContexts() const override;
 
@@ -191,14 +191,24 @@ class ONNXRuntimePredictor : public PaddlePredictor {
   ///
   bool FindONNXDesc(const std::string &name, bool is_input);
 
- private:
-  AnalysisConfig config_;
+  /// \brief get the Ort Value (input Tensor).
+  ///
+  /// \param[in] desc ONNXDesc (name, shape, dtype)
+  ///
+  /// \param[in] device_name "cpu" or "gpu" of device
+  ///
+  /// \return get a Ort::Value
+  ///
+  Ort::Value GetOrtValue(const ONNXDesc &desc, const char *device_name);
 
+ private:
   // ONNXRuntime
   Ort::Env env_;
   Ort::Session session_{nullptr};
   std::shared_ptr<Ort::IoBinding> binding_;
 
+  AnalysisConfig config_;
+  std::mutex clone_mutex_;
   platform::Place place_;
   std::vector<ONNXDesc> input_desc_;
   std::vector<ONNXDesc> output_desc_;
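
The header now gives the predictor a framework::Scope of its own plus a clone_mutex_ guarding Clone(); every input variable registered in Init() lives in that scope as a LoDTensor whose buffer is later handed to ONNX Runtime. A hedged sketch of what that registration amounts to, using the same framework helpers; RegisterInput is a hypothetical standalone function, not a member added by this commit:

```cpp
#include <string>

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable_helper.h"

// Create (or reuse) a scope variable holding a LoDTensor. GetInputTensor()
// hands this tensor to ZeroCopyTensor, and CopyFromCpu() then writes straight
// into its mutable data.
void RegisterInput(paddle::framework::Scope *scope, const std::string &name) {
  auto *var = scope->Var(name);
  paddle::framework::InitializeVariable(
      var, paddle::framework::proto::VarType::LOD_TENSOR);
  auto *tensor = var->GetMutable<paddle::framework::LoDTensor>();
  (void)tensor;  // shape and data are filled later by Tensor::CopyFromCpu
}
```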
