
Commit cbef617

ericcraw and MayureshV1 authored
Optimize CPU time spent in inference path (continued) (#695)
* Use infer instead of start async/wait * Introduce OvExeceptionBoundary for exception handling * unbound infer request pool * Fix dynamically sized i/o * Rename onnx->ort + remove unused parameter shape functions * fix linux build issue + review dog comments * more linux build fixes + copilot feedback * disable ReduceSum_noop_axes_input_initializer_opset_18 * review feedback + last minute touch ups * slightly more scalable llm handling * Simplify dynamic shape checks * add missing staged changes * Remove references to IO_BUFFER_ENABLED * Minor tweaks to InferRequestPool * remove unused mem_info * Move ParameterShape and ParameterInfo out of ov_interface --------- Co-authored-by: MayureshV1 <47039074+MayureshV1@users.noreply.github.com>
1 parent 409b224 · commit cbef617

File tree

11 files changed: +324 −497 lines changed

onnxruntime/core/providers/openvino/backend_manager.cc

Lines changed: 17 additions & 21 deletions
```diff
@@ -44,10 +44,6 @@ BackendManager::BackendManager(SessionContext& session_context,
       shared_context_{shared_context} {
   subgraph_context_.is_ep_ctx_graph = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(subgraph);
 
-  bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos ||
-                    session_context_.device_type.find("GPU") != std::string::npos;
-  bool npu = session_context_.device_type.find("NPU") != std::string::npos;
-
   subgraph_context_.model_precision = [&](const GraphViewer& graph_viewer) {
     // return empty if graph has no inputs or if types are not one of FP32/FP16
     // else assume the type of the first input
@@ -112,8 +108,7 @@ BackendManager::BackendManager(SessionContext& session_context,
   if (ModelHasSymbolicInputDims(subgraph)) {
     subgraph_context_.has_dynamic_input_shape = true;
     LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has symbolic input dims";
-    if (cpu_or_gpu || (npu && session_context_.enable_causallm) &&
-        !session_context_.disable_dynamic_shapes) {
+    if (!session_context_.disable_dynamic_shapes) {
       LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Starting backend initialization. "
                          << "Creating backend Dynamic Shapes";
       try {
@@ -579,30 +574,34 @@ void BackendManager::ValidateInputShapes(const reshape_t& shapes,
 void BackendManager::Compute(OrtKernelContext* context) {
   Ort::KernelContext ctx(context);
   std::chrono::high_resolution_clock::time_point start_compute, end_compute;
-  bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos ||
-                    session_context_.device_type.find("GPU") != std::string::npos;
-  bool npu = session_context_.device_type.find("NPU") != std::string::npos;
+
 #ifdef OPENVINO_FIL_ENABLED
   static bool fil_enabled = true;
   if (fil_enabled) {
     start_compute = std::chrono::high_resolution_clock::now();
     LOGS_DEFAULT(INFO) << "Start Compute";
   }
 #endif
-  // OV NPU doesn't support dynamic shaped model inference.
+
   // if disable_dynamic_shapes is set to true then execution of dynamic model is done
   // by rewriting the model to static shaped model at runtime based on input shape.
-  // disable_dynamic_shapes is always set to true for OV NPU plugin.
-  if (subgraph_context_.has_dynamic_input_shape &&
-      !session_context_.disable_dynamic_shapes &&
-      (cpu_or_gpu || (npu && session_context_.enable_causallm))) {
+  // disable_dynamic_shapes should be set for devices that don't support dynamic shapes.
+  bool need_dynamic_backend = subgraph_context_.has_dynamic_input_shape &&
+                              session_context_.disable_dynamic_shapes;
+
+  if (!need_dynamic_backend) {
     concrete_backend_->Infer(context);
-  } else if (subgraph_context_.has_dynamic_input_shape) {
+  } else {
     std::vector<std::vector<int64_t>> tensor_shapes = GetInputTensorShapes(ctx);
     auto key = MakeMapKeyString(tensor_shapes, session_context_.device_type);
     std::shared_ptr<IBackend> dynamic_backend;
-    auto search = backend_map_.find(key);
-    if (search == backend_map_.end()) {
+
+    {
+      std::unique_lock<std::mutex> lock(mutex_);
+      dynamic_backend = backend_map_[key];
+    }
+
+    if (!dynamic_backend) {
       ptr_stream_t model_stream;
       LOGS_DEFAULT(INFO) << "[OpenVINO-EP] "
                          << "Creating dynamic backend for key: " << key;
@@ -643,14 +642,11 @@ void BackendManager::Compute(OrtKernelContext* context) {
       }
 #endif
       }
+      std::unique_lock<std::mutex> lock(mutex_);
       backend_map_.insert({key, dynamic_backend});
-    } else {
-      dynamic_backend = search->second;
     }
 
     dynamic_backend->Infer(context);
-  } else {
-    concrete_backend_->Infer(context);
   }
 #ifdef OPENVINO_FIL_ENABLED
   if (fil_enabled) {
```
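The substantive change in `Compute` is the thread-safety fix around `backend_map_` (paired with the new `mutex_` member in `backend_manager.h` below): the map lookup and the insert are each held under a short lock, while the expensive backend compilation runs unlocked. A minimal standalone sketch of that pattern, with a hypothetical `Backend` standing in for `IBackend` and a map assignment instead of `insert()` to keep the sketch self-consistent:

```cpp
// Sketch of the lock-lookup / unlocked-create / lock-insert pattern above.
// Backend is a hypothetical stand-in for IBackend, not the EP's type.
#include <map>
#include <memory>
#include <mutex>
#include <string>

struct Backend {};  // stand-in for the compiled backend

class BackendCache {
 public:
  std::shared_ptr<Backend> GetOrCreate(const std::string& key) {
    std::shared_ptr<Backend> backend;
    {
      // Short critical section: only the lookup is serialized.
      std::unique_lock<std::mutex> lock(mutex_);
      auto it = map_.find(key);
      if (it != map_.end()) backend = it->second;
    }
    if (!backend) {
      // Expensive creation (model compilation in the real code) runs
      // unlocked; two threads can race on a cold key and both build.
      backend = std::make_shared<Backend>();
      std::unique_lock<std::mutex> lock(mutex_);
      map_[key] = backend;  // last writer wins; either result is usable
    }
    return backend;
  }

 private:
  std::mutex mutex_;
  std::map<std::string, std::shared_ptr<Backend>> map_;
};
```

The trade-off is that two threads hitting a cold key may both compile a backend; the diff accepts that in exchange for never holding the lock across compilation.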

onnxruntime/core/providers/openvino/backend_manager.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -54,6 +54,7 @@ class BackendManager {
 
   std::unique_ptr<ONNX_NAMESPACE::ModelProto> model_proto_;
   std::shared_ptr<IBackend> concrete_backend_;
+  std::mutex mutex_;
   std::map<std::string, std::shared_ptr<IBackend>> backend_map_;
   SubGraphContext subgraph_context_;
   EPCtxHandler& ep_ctx_handle_;
```

onnxruntime/core/providers/openvino/backend_utils.cc

Lines changed: 2 additions & 42 deletions
```diff
@@ -179,32 +179,6 @@ CreateOVModel(std::string&& model,
   }
 }
 
-Ort::UnownedValue
-GetOutputTensor(Ort::KernelContext& context, size_t batch_size,
-                OVInferRequestPtr infer_request,
-                std::string output_name,
-                const SubGraphContext::string_index_map_t& output_names) {
-  auto graph_output_blob = infer_request->GetTensor(output_name);
-
-  auto graph_output_dims = graph_output_blob->get_shape();
-
-  if (batch_size > 1) {
-    // Add the batch size as dim 0.
-    graph_output_dims.insert(graph_output_dims.begin(), batch_size);
-  }
-  size_t num_dims = graph_output_dims.size();
-  std::unique_ptr<int64_t[]> output_shape(new int64_t[num_dims]);
-  for (size_t j = 0; j < num_dims; j++) {
-    output_shape[j] = static_cast<int64_t>(graph_output_dims[j]);
-  }
-  auto it = output_names.find(output_name);
-  if (it == output_names.end()) {
-    ORT_THROW(log_tag + "Output names mismatch between OpenVINO and ONNX");
-  }
-  int index = it->second;
-  return context.GetOutput(index, output_shape.get(), num_dims);
-}
-
 Ort::UnownedValue
 GetOutputTensor(Ort::KernelContext& context,
                 std::string output_name,
@@ -220,14 +194,9 @@ GetOutputTensor(Ort::KernelContext& context,
     ORT_THROW(log_tag + "Output names mismatch between OpenVINO and ONNX");
   }
   int index = it->second;
-  auto shape = node->get_shape();
+  auto output_shape = ParameterShape::ToOrtShape(node->get_shape());
 
-  size_t num_dims = shape.size();
-  std::unique_ptr<int64_t[]> output_shape(new int64_t[num_dims]);
-  for (size_t j = 0; j < num_dims; j++) {
-    output_shape[j] = static_cast<int64_t>(shape[j]);
-  }
-  return context.GetOutput(index, output_shape.get(), num_dims);
+  return context.GetOutput(index, output_shape);
 }
 
 int GetFirstAvailableDevice(SessionContext& session_context) {
@@ -312,15 +281,6 @@ void FillInputBlob(OVTensorPtr inputBlob, size_t batch_slice_idx,
   std::memcpy(input_data, batch_memory_offset, input_data_size);
 }
 
-void FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor,
-                    size_t batch_slice_idx) {
-  auto output_data = outputBlob->data();
-  size_t output_data_size = outputBlob->get_byte_size();
-  char* tensor_data = output_tensor.GetTensorMutableData<char>();
-  char* batch_memory_offset = tensor_data + output_data_size * batch_slice_idx;
-  std::memcpy(batch_memory_offset, output_data, output_data_size);
-}
-
 void printPerformanceCounts(const std::vector<OVProfilingInfo>& performanceMap,
                             std::ostream& stream, std::string deviceName) {
   int64_t totalTime = 0;
```
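The `GetOutputTensor` rewrite works because `ParameterShape::ToOrtShape` produces a `std::vector<int64_t>`, which matches the vector overload of `Ort::KernelContext::GetOutput`, so the manually managed `int64_t[]` buffer and explicit dim count disappear. A hedged sketch of the resulting call shape (function name and parameters are illustrative, not the EP's actual signature; include paths vary by build setup):

```cpp
// Illustrative only: shows why the diff's one-liner type-checks.
#include "onnxruntime_cxx_api.h"                    // Ort::KernelContext (assumed path)
#include "core/providers/openvino/backend_utils.h"  // ParameterShape (assumed path)
#include "openvino/core/shape.hpp"                  // ov::Shape

Ort::UnownedValue GetOutputSketch(Ort::KernelContext& context, size_t index,
                                  const ov::Shape& ov_shape) {
  // ToOrtShape returns std::vector<int64_t> via a std::transform over OV dims.
  auto output_shape = onnxruntime::openvino_ep::ParameterShape::ToOrtShape(ov_shape);
  // Vector overload of GetOutput: no raw buffer or dim count needed.
  return context.GetOutput(index, output_shape);
}
```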

onnxruntime/core/providers/openvino/backend_utils.h

Lines changed: 41 additions & 10 deletions
```diff
@@ -27,8 +27,48 @@
 
 namespace onnxruntime {
 namespace openvino_ep {
+constexpr std::string log_tag = "[OpenVINO-EP] ";
+
+struct ParameterShape {
+  using ort_shape_t = std::vector<int64_t>;
+
+  static ov::PartialShape ToOvPartialShape(const ort_shape_t& ort_shape) {
+    std::vector<ov::Dimension> ov_shape(ort_shape.size());
+    std::transform(ort_shape.begin(), ort_shape.end(), ov_shape.begin(), [](int64_t dim) {
+      return dim == -1 ? ov::Dimension::dynamic() : ov::Dimension(dim);
+    });
+    return ov::PartialShape(ov_shape);
+  }
+
+  static ort_shape_t ToOrtShape(const ov::PartialShape& ov_shape) {
+    ort_shape_t ort_shape(ov_shape.size());
+    std::transform(ov_shape.begin(), ov_shape.end(), ort_shape.begin(), [](const auto& dim) {
+      return dim.is_dynamic() ? -1 : dim.get_length();
+    });
+    return ort_shape;
+  }
+
+  static ort_shape_t ToOrtShape(const ov::Shape& ov_shape) {
+    ort_shape_t ort_shape(ov_shape.size());
+    std::transform(ov_shape.begin(), ov_shape.end(), ort_shape.begin(), [](const auto& dim) {
+      return narrow<int64_t>(dim);
+    });
+    return ort_shape;
+  }
+
+  operator ov::Shape() const { return ov_.get_shape(); }
+  operator const ov::PartialShape&() const { return ov_; }
+  operator const ort_shape_t&() const { return ort_; }
+
+  explicit ParameterShape(const ort_shape_t& ort_shape) : ort_(ort_shape), ov_(ToOvPartialShape(ort_shape)) {}
+  explicit ParameterShape(const ov::PartialShape& ov_partial_shape) : ov_(ov_partial_shape), ort_(ToOrtShape(ov_partial_shape)) {}
+
+ private:
+  ort_shape_t ort_;
+  ov::PartialShape ov_;
+};
+
 namespace backend_utils {
-const std::string log_tag = "[OpenVINO-EP] ";
 
 bool IsDebugEnabled();
 
@@ -48,19 +88,10 @@ GetOutputTensor(Ort::KernelContext& context,
                 const SubGraphContext::string_index_map_t& output_names,
                 std::shared_ptr<ov::Node> node);
 
-Ort::UnownedValue
-GetOutputTensor(Ort::KernelContext& context, size_t batch_size,
-                OVInferRequestPtr infer_request,
-                std::string output_name,
-                const SubGraphContext::string_index_map_t& output_names);
-
 void FillInputBlob(OVTensorPtr inputBlob, size_t batch_slice_idx,
                    std::string input_name, Ort::KernelContext& context,
                    const SubGraphContext& subgraph_context);
 
-void FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor,
-                    size_t batch_slice_idx);
-
 std::shared_ptr<const OVNetwork>
 CreateOVModel(std::string&& model,
               const SessionContext& session_context,
```
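For orientation, a hedged usage sketch of the new helper (the `Example` function and include paths are illustrative, not EP code): an ORT shape using -1 for a symbolic dim converts to an `ov::PartialShape` with a dynamic dimension and round-trips back, which is exactly the mapping `ToOvPartialShape` and `ToOrtShape` encode.

```cpp
// Hypothetical usage of ParameterShape; include paths are assumptions.
#include <cassert>
#include <cstdint>
#include <vector>
#include "core/providers/openvino/backend_utils.h"  // ParameterShape
#include "openvino/core/partial_shape.hpp"          // ov::PartialShape

void Example() {
  using onnxruntime::openvino_ep::ParameterShape;

  // ORT encodes a symbolic (dynamic) dimension as -1.
  std::vector<int64_t> ort_shape{-1, 3, 224, 224};
  ParameterShape shape(ort_shape);

  // Implicit conversions expose both views of the same shape.
  const ov::PartialShape& ov_shape = shape;  // {?,3,224,224}
  assert(ov_shape[0].is_dynamic());

  // Round trip: dynamic dims map back to -1.
  assert(ParameterShape::ToOrtShape(ov_shape) == ort_shape);
}
```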
