[Serving][Backend] Backend support zero_copy_infer and Serving reduce the output memory copy (#703)

* backend add zero copy infer interface

* fix bug

* fix bug

* fix bug

* paddle ipu
heliqi authored Nov 28, 2022
1 parent edcf150 commit 42f1888
Showing 21 changed files with 254 additions and 109 deletions.
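Summary of the change: every backend's `Infer` gains a `copy_to_fd` flag (default `true`). With the default, output data is copied into caller-owned `FDTensor`s as before; with `false`, backends that support sharing (Paddle, ORT, TRT and OpenVINO, per the comment in backend.h below) wrap their internal output buffers via `SetExternalData`, saving one memory copy per output in Serving. A hedged, caller-side sketch of the intended contract (`backend` and `BuildInputs` are hypothetical, not part of this commit):

```cpp
// Illustrative only -- not code from this commit.
std::vector<fastdeploy::FDTensor> inputs = BuildInputs();  // hypothetical helper
std::vector<fastdeploy::FDTensor> outputs;

// Default behaviour: each output owns its memory and stays valid
// even after the backend runs again.
backend->Infer(inputs, &outputs);

// Zero-copy behaviour: outputs merely reference backend-owned memory,
// so they must be consumed before the next Infer() on this backend.
backend->Infer(inputs, &outputs, /*copy_to_fd=*/false);
```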
5 changes: 4 additions & 1 deletion fastdeploy/backends/backend.h
@@ -62,8 +62,11 @@ class BaseBackend {
virtual TensorInfo GetOutputInfo(int index) = 0;
virtual std::vector<TensorInfo> GetInputInfos() = 0;
virtual std::vector<TensorInfo> GetOutputInfos() = 0;
// If copy_to_fd is true, copy memory data to FDTensor;
// else share memory to FDTensor (only Paddle, ORT, TRT and OpenVINO support it)
virtual bool Infer(std::vector<FDTensor>& inputs,
std::vector<FDTensor>* outputs) = 0;
std::vector<FDTensor>* outputs,
bool copy_to_fd = true) = 0;
virtual std::unique_ptr<BaseBackend> Clone(void *stream = nullptr,
int device_id = -1) {
FDERROR << "Clone no support" << std::endl;
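The backends that implement sharing all follow the same two-branch conversion per output, which the hunks below repeat with backend-specific tensor types. A condensed, hedged sketch of that pattern (`raw_ptr`, `raw_device` and the surrounding variables are placeholders):

```cpp
// Condensed restatement of the per-output pattern used below -- a sketch,
// not code from the commit. raw_ptr/raw_device stand for the backend-owned
// output buffer and the device it lives on.
if (copy_to_fd) {
  // Copy path: the FDTensor allocates its own buffer and data is memcpy'd in.
  fd_tensor->Resize(shape, dtype, name, Device::CPU);
  std::memcpy(fd_tensor->MutableData(), raw_ptr, fd_tensor->Nbytes());
} else {
  // Zero-copy path: the FDTensor only records the external pointer; the
  // backend keeps ownership and controls the data's lifetime.
  fd_tensor->name = name;
  fd_tensor->SetExternalData(shape, dtype, const_cast<void*>(raw_ptr), raw_device);
}
```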
3 changes: 2 additions & 1 deletion fastdeploy/backends/lite/lite_backend.cc
@@ -187,7 +187,8 @@ TensorInfo LiteBackend::GetOutputInfo(int index) {
std::vector<TensorInfo> LiteBackend::GetOutputInfos() { return outputs_desc_; }

bool LiteBackend::Infer(std::vector<FDTensor>& inputs,
std::vector<FDTensor>* outputs) {
std::vector<FDTensor>* outputs,
bool copy_to_fd) {
if (inputs.size() != inputs_desc_.size()) {
FDERROR << "[LiteBackend] Size of inputs(" << inputs.size()
<< ") should keep same with the inputs of this model("
4 changes: 3 additions & 1 deletion fastdeploy/backends/lite/lite_backend.h
@@ -60,7 +60,9 @@ class LiteBackend : public BaseBackend {
const std::string& params_file,
const LiteBackendOption& option = LiteBackendOption());

bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs) override; // NOLINT
bool Infer(std::vector<FDTensor>& inputs,
std::vector<FDTensor>* outputs,
bool copy_to_fd = true) override; // NOLINT

int NumInputs() const override { return inputs_desc_.size(); }

20 changes: 15 additions & 5 deletions fastdeploy/backends/openvino/ov_backend.cc
@@ -341,7 +341,8 @@ int OpenVINOBackend::NumInputs() const { return input_infos_.size(); }
int OpenVINOBackend::NumOutputs() const { return output_infos_.size(); }

bool OpenVINOBackend::Infer(std::vector<FDTensor>& inputs,
std::vector<FDTensor>* outputs) {
std::vector<FDTensor>* outputs,
bool copy_to_fd) {
if (inputs.size() != input_infos_.size()) {
FDERROR << "[OpenVINOBackend] Size of the inputs(" << inputs.size()
<< ") should keep same with the inputs of this model("
@@ -364,11 +365,20 @@ bool OpenVINOBackend::Infer(std::vector<FDTensor>& inputs,
auto out_tensor_shape = out_tensor.get_shape();
std::vector<int64_t> shape(out_tensor_shape.begin(),
out_tensor_shape.end());
(*outputs)[i].Allocate(shape,
OpenVINODataTypeToFD(out_tensor.get_element_type()),
output_infos_[i].name);
memcpy((*outputs)[i].MutableData(), out_tensor.data(),
(*outputs)[i].Nbytes());
if(copy_to_fd) {
(*outputs)[i].Resize(shape,
OpenVINODataTypeToFD(out_tensor.get_element_type()),
output_infos_[i].name,
Device::CPU);
memcpy((*outputs)[i].MutableData(), out_tensor.data(),
(*outputs)[i].Nbytes());
} else {
(*outputs)[i].name = output_infos_[i].name;
(*outputs)[i].SetExternalData(shape,
OpenVINODataTypeToFD(out_tensor.get_element_type()),
out_tensor.data(),
Device::CPU);
}
}
return true;
}
3 changes: 2 additions & 1 deletion fastdeploy/backends/openvino/ov_backend.h
@@ -48,7 +48,8 @@ class OpenVINOBackend : public BaseBackend {
const OpenVINOBackendOption& option = OpenVINOBackendOption());

bool Infer(std::vector<FDTensor>& inputs,
std::vector<FDTensor>* outputs) override;
std::vector<FDTensor>* outputs,
bool copy_to_fd = true) override;

int NumInputs() const override;

24 changes: 17 additions & 7 deletions fastdeploy/backends/ort/ort_backend.cc
@@ -181,8 +181,8 @@ bool OrtBackend::InitFromOnnx(const std::string& model_file,
return true;
}

void OrtBackend::CopyToCpu(const Ort::Value& value, FDTensor* tensor,
const std::string& name) {
void OrtBackend::OrtValueToFDTensor(const Ort::Value& value, FDTensor* tensor,
const std::string& name, bool copy_to_fd) {
const auto info = value.GetTensorTypeAndShapeInfo();
const auto data_type = info.GetElementType();
size_t numel = info.GetElementCount();
@@ -210,12 +210,21 @@ void OrtBackend::CopyToCpu(const Ort::Value& value, FDTensor* tensor,
"Unrecognized data type of %d while calling OrtBackend::CopyToCpu().",
data_type);
}
tensor->Resize(shape, dtype, name);
memcpy(tensor->MutableData(), value.GetTensorData<void*>(), numel);
const void* value_ptr = value.GetTensorData<void*>();
if (copy_to_fd) {
tensor->Resize(shape, dtype, name);
memcpy(tensor->MutableData(), value_ptr, numel);
} else {
tensor->name = name;
tensor->SetExternalData(
shape, dtype,
const_cast<void*>(value_ptr), Device::CPU);
}
}

bool OrtBackend::Infer(std::vector<FDTensor>& inputs,
std::vector<FDTensor>* outputs) {
std::vector<FDTensor>* outputs,
bool copy_to_fd) {
if (inputs.size() != inputs_desc_.size()) {
FDERROR << "[OrtBackend] Size of the inputs(" << inputs.size()
<< ") should keep same with the inputs of this model("
@@ -243,11 +252,12 @@ bool OrtBackend::Infer(std::vector<FDTensor>& inputs,
return false;
}

// Copy result after inference
// Convert result after inference
std::vector<Ort::Value> ort_outputs = binding_->GetOutputValues();
outputs->resize(ort_outputs.size());
for (size_t i = 0; i < ort_outputs.size(); ++i) {
CopyToCpu(ort_outputs[i], &((*outputs)[i]), outputs_desc_[i].name);
OrtValueToFDTensor(ort_outputs[i], &((*outputs)[i]),
outputs_desc_[i].name, copy_to_fd);
}

return true;
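Two details of the ORT hunk are easy to miss: `CopyToCpu` is renamed to `OrtValueToFDTensor` because it no longer always copies, and the zero-copy branch needs a `const_cast` since `Ort::Value::GetTensorData` hands back a const pointer while `SetExternalData` takes a mutable `void*`. The shared tensor should therefore be treated as read-only, and the backing `Ort::Value` must outlive it, roughly:

```cpp
// Simplified restatement of the zero-copy branch above (sketch only).
const void* value_ptr = value.GetTensorData<void*>();
tensor->name = name;
// FDTensor does not take ownership; the Ort::Value holding the output
// must stay alive for as long as this tensor is read.
tensor->SetExternalData(shape, dtype, const_cast<void*>(value_ptr), Device::CPU);
```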
7 changes: 4 additions & 3 deletions fastdeploy/backends/ort/ort_backend.h
@@ -68,7 +68,8 @@ class OrtBackend : public BaseBackend {
bool from_memory_buffer = false);

bool Infer(std::vector<FDTensor>& inputs,
std::vector<FDTensor>* outputs) override;
std::vector<FDTensor>* outputs,
bool copy_to_fd = true) override;

int NumInputs() const override { return inputs_desc_.size(); }

@@ -92,7 +93,7 @@ Ort::CustomOpDomain custom_op_domain_ = Ort::CustomOpDomain("Paddle");
Ort::CustomOpDomain custom_op_domain_ = Ort::CustomOpDomain("Paddle");
#endif
OrtBackendOption option_;
void CopyToCpu(const Ort::Value& value, FDTensor* tensor,
const std::string& name);
void OrtValueToFDTensor(const Ort::Value& value, FDTensor* tensor,
const std::string& name, bool copy_to_fd);
};
} // namespace fastdeploy
14 changes: 11 additions & 3 deletions fastdeploy/backends/paddle/paddle_backend.cc
@@ -194,7 +194,8 @@ std::vector<TensorInfo> PaddleBackend::GetOutputInfos() {
}

bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
std::vector<FDTensor>* outputs) {
std::vector<FDTensor>* outputs,
bool copy_to_fd) {
if (inputs.size() != inputs_desc_.size()) {
FDERROR << "[PaddleBackend] Size of inputs(" << inputs.size()
<< ") should keep same with the inputs of this model("
@@ -208,11 +209,18 @@ bool PaddleBackend::Infer(std::vector<FDTensor>& inputs,
}

predictor_->Run();

// Outputs can share backend memory only on CPU or GPU
if(option_.use_ipu) {
copy_to_fd = true;
}
outputs->resize(outputs_desc_.size());
for (size_t i = 0; i < outputs_desc_.size(); ++i) {
auto handle = predictor_->GetOutputHandle(outputs_desc_[i].name);
(*outputs)[i].is_pinned_memory = option_.enable_pinned_memory;
CopyTensorToCpu(handle, &((*outputs)[i]));
if(copy_to_fd) {
(*outputs)[i].is_pinned_memory = option_.enable_pinned_memory;
}
PaddleTensorToFDTensor(handle, &((*outputs)[i]), copy_to_fd);
}
return true;
}
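Note the guard added above: on IPU the Paddle backend overrides the flag, so callers always receive copied outputs there, and pinned host memory only applies to the copy path. An illustrative restatement (not new behaviour, and `fd_tensor`/`handle` stand for the loop variables in the hunk):

```cpp
// Illustrative restatement of the output loop above:
// sharing is skipped on IPU, and pinned memory matters only when copying.
bool do_copy = copy_to_fd || option_.use_ipu;
if (do_copy) {
  fd_tensor->is_pinned_memory = option_.enable_pinned_memory;
}
PaddleTensorToFDTensor(handle, fd_tensor, do_copy);
```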
13 changes: 9 additions & 4 deletions fastdeploy/backends/paddle/paddle_backend.h
@@ -87,9 +87,12 @@ paddle_infer::PlaceType ConvertFDDeviceToPlace(Device device);
// Share memory buffer with paddle_infer::Tensor from fastdeploy::FDTensor
void ShareTensorFromFDTensor(paddle_infer::Tensor* tensor, FDTensor& fd_tensor);

// Copy memory data from paddle_infer::Tensor to fastdeploy::FDTensor
void CopyTensorToCpu(std::unique_ptr<paddle_infer::Tensor>& tensor,
FDTensor* fd_tensor);
// Convert paddle_infer::Tensor to fastdeploy::FDTensor.
// If copy_to_fd is true, copy memory data to FDTensor;
// else share memory to FDTensor
void PaddleTensorToFDTensor(std::unique_ptr<paddle_infer::Tensor>& tensor,
FDTensor* fd_tensor,
bool copy_to_fd);

// Convert data type from paddle inference to fastdeploy
FDDataType PaddleDataTypeToFD(const paddle_infer::DataType& dtype);
@@ -108,7 +111,9 @@ class PaddleBackend : public BaseBackend {
const PaddleBackendOption& option = PaddleBackendOption());

bool Infer(std::vector<FDTensor>& inputs,
std::vector<FDTensor>* outputs) override;
std::vector<FDTensor>* outputs,
bool copy_to_fd = true) override;


int NumInputs() const override { return inputs_desc_.size(); }

44 changes: 30 additions & 14 deletions fastdeploy/backends/paddle/util.cc
@@ -61,25 +61,41 @@ void ShareTensorFromFDTensor(paddle_infer::Tensor* tensor,
Str(fd_tensor.dtype).c_str());
}

void CopyTensorToCpu(std::unique_ptr<paddle_infer::Tensor>& tensor,
FDTensor* fd_tensor) {
void PaddleTensorToFDTensor(std::unique_ptr<paddle_infer::Tensor>& tensor,
FDTensor* fd_tensor,
bool copy_to_fd) {
auto fd_dtype = PaddleDataTypeToFD(tensor->type());
std::vector<int64_t> shape;
auto tmp_shape = tensor->shape();
shape.assign(tmp_shape.begin(), tmp_shape.end());
fd_tensor->Resize(shape, fd_dtype, tensor->name());
if (fd_tensor->dtype == FDDataType::FP32) {
tensor->CopyToCpu(static_cast<float*>(fd_tensor->MutableData()));
return;
} else if (fd_tensor->dtype == FDDataType::INT32) {
tensor->CopyToCpu(static_cast<int32_t*>(fd_tensor->MutableData()));
return;
} else if (fd_tensor->dtype == FDDataType::INT64) {
tensor->CopyToCpu(static_cast<int64_t*>(fd_tensor->MutableData()));
return;
if(copy_to_fd) {
fd_tensor->Resize(shape, fd_dtype, tensor->name());
if (fd_tensor->dtype == FDDataType::FP32) {
tensor->CopyToCpu(static_cast<float*>(fd_tensor->MutableData()));
return;
} else if (fd_tensor->dtype == FDDataType::INT32) {
tensor->CopyToCpu(static_cast<int32_t*>(fd_tensor->MutableData()));
return;
} else if (fd_tensor->dtype == FDDataType::INT64) {
tensor->CopyToCpu(static_cast<int64_t*>(fd_tensor->MutableData()));
return;
}
FDASSERT(false, "Unexpected data type(%s) while infer with PaddleBackend.",
Str(fd_tensor->dtype).c_str());
} else {
paddle_infer::PlaceType place;
int size = 0;
// TODO(liqi): Paddle's tensor->data interface doesn't return the device id
// and doesn't support returning void*.
auto* out_data = tensor->data<uint8_t>(&place, &size);
Device device = Device::CPU;
if(place == paddle_infer::PlaceType::kGPU) {
device = Device::GPU;
}
fd_tensor->SetExternalData(
shape, fd_dtype,
reinterpret_cast<void*>(out_data), device);
}
FDASSERT(false, "Unexpected data type(%s) while infer with PaddleBackend.",
Str(fd_tensor->dtype).c_str());
}

FDDataType PaddleDataTypeToFD(const paddle_infer::DataType& dtype) {
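Unlike the other backends touched here, Paddle's zero-copy branch can hand back device memory: when `tensor->data<uint8_t>(&place, &size)` reports `PlaceType::kGPU`, the resulting FDTensor is tagged `Device::GPU` and its pointer must not be dereferenced on the host. A hedged consumer-side sketch (assuming FDTensor exposes its `device` member as elsewhere in FastDeploy; `CopyToHost` and `Postprocess` are hypothetical helpers):

```cpp
// Sketch of how a consumer might guard a shared Paddle output (illustrative).
if (output.device == Device::GPU) {
  CopyToHost(&output);                          // hypothetical helper,
                                                // or keep processing on the GPU
} else {
  Postprocess(output.Data(), output.Nbytes());  // host memory, safe to read
}
```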
3 changes: 2 additions & 1 deletion fastdeploy/backends/poros/poros_backend.cc
@@ -188,7 +188,8 @@ bool PorosBackend::InitFromPoros(const std::string& model_file,
}

bool PorosBackend::Infer(std::vector<FDTensor>& inputs,
std::vector<FDTensor>* outputs) {
std::vector<FDTensor>* outputs,
bool copy_to_fd) {
// Convert FD Tensor to PyTorch Tensor
std::vector<torch::jit::IValue> poros_inputs;
bool is_backend_cuda =
4 changes: 3 additions & 1 deletion fastdeploy/backends/poros/poros_backend.h
@@ -85,7 +85,9 @@ class PorosBackend : public BaseBackend {
std::vector<std::vector<FDTensor>>& prewarm_tensors,
const PorosBackendOption& option = PorosBackendOption());

bool Infer(std::vector<FDTensor>& inputs, std::vector<FDTensor>* outputs);
bool Infer(std::vector<FDTensor>& inputs,
std::vector<FDTensor>* outputs,
bool copy_to_fd = true) override;

int NumInputs() const { return _numinputs; }

3 changes: 2 additions & 1 deletion fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.cc
@@ -289,7 +289,8 @@ std::vector<TensorInfo> RKNPU2Backend::GetOutputInfos() {
}

bool RKNPU2Backend::Infer(std::vector<FDTensor>& inputs,
std::vector<FDTensor>* outputs) {
std::vector<FDTensor>* outputs,
bool copy_to_fd) {
int ret = RKNN_SUCC;
// Judge whether the input and output size are the same
if (inputs.size() != inputs_desc_.size()) {
3 changes: 2 additions & 1 deletion fastdeploy/backends/rknpu/rknpu2/rknpu2_backend.h
@@ -72,7 +72,8 @@ class RKNPU2Backend : public BaseBackend {
std::vector<TensorInfo> GetInputInfos() override;
std::vector<TensorInfo> GetOutputInfos() override;
bool Infer(std::vector<FDTensor>& inputs,
std::vector<FDTensor>* outputs) override;
std::vector<FDTensor>* outputs,
bool copy_to_fd = true) override;

private:
// The object of rknn context.
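Finally, several backends in this commit (Lite, Poros, RKNPU2) only gain the parameter to keep the `BaseBackend` interface uniform; their `Infer` bodies still copy into the output tensors. `copy_to_fd=false` is therefore an optimization hint rather than a guarantee, and consumer code can stay identical either way, for example (`Consume` is a hypothetical placeholder):

```cpp
// Sketch: the same consumer code works whether the backend shares or copies;
// the flag only changes who owns the output buffers.
backend->Infer(inputs, &outputs, /*copy_to_fd=*/false);
for (const auto& t : outputs) {
  Consume(t);  // must finish before the next Infer() if memory is shared
}
```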
