Add QDQ scale propagation pass #713

Open · wants to merge 7 commits into base: ovep-develop
2 changes: 1 addition & 1 deletion cmake/onnxruntime_providers_openvino.cmake
@@ -49,7 +49,7 @@
endif()
add_dependencies(onnxruntime_providers_openvino onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES})
target_include_directories(onnxruntime_providers_openvino SYSTEM PUBLIC ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${OpenVINO_INCLUDE_DIR} ${OPENVINO_INCLUDE_DIR_LIST} ${PYTHON_INCLUDE_DIRS} $ENV{OPENCL_INCS} $ENV{OPENCL_INCS}/../../cl_headers/)
target_link_libraries(onnxruntime_providers_openvino ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 ${OPENVINO_LIB_LIST} ${ABSEIL_LIBS} Eigen3::Eigen)
target_link_libraries(onnxruntime_providers_openvino ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 ${OPENVINO_LIB_LIST} ${ABSEIL_LIBS} Eigen3::Eigen onnx_proto)

target_compile_definitions(onnxruntime_providers_openvino PRIVATE FILE_NAME=\"onnxruntime_providers_openvino.dll\")

61 changes: 36 additions & 25 deletions onnxruntime/core/providers/openvino/backend_manager.cc
@@ -20,6 +20,7 @@
#include "core/providers/openvino/ov_interface.h"
#include "core/providers/openvino/ov_versions/capability.h"
#include "core/providers/openvino/qdq_transformations/qdq_stripping.h"
#include "core/providers/openvino/qdq_transformations/qdq_scales_fix.h"

namespace onnxruntime {
namespace openvino_ep {
@@ -43,10 +44,9 @@ BackendManager::BackendManager(SessionContext& session_context,
session_context_(session_context),
shared_context_{shared_context} {
subgraph_context_.is_ep_ctx_graph = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(subgraph);

bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos;
bool npu = session_context_.device_type.find("NPU") != std::string::npos;
// If the graph contains an OVIR-wrapped node, check whether it has a matching XML file name attribute
subgraph_context_.is_ep_ctx_ovir_encapsulated = ep_ctx_handle_.CheckEPCacheContextAttribute(subgraph,
session_context_.onnx_model_path_name.filename().replace_extension("xml").string());

subgraph_context_.model_precision = [&](const GraphViewer& graph_viewer) {
// return empty if graph has no inputs or if types are not one of FP32/FP16
@@ -112,8 +112,7 @@ BackendManager::BackendManager(SessionContext& session_context,
if (ModelHasSymbolicInputDims(subgraph)) {
subgraph_context_.has_dynamic_input_shape = true;
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has symbolic input dims";
if (cpu_or_gpu || (npu && session_context_.enable_causallm) &&
!session_context_.disable_dynamic_shapes) {
if (!session_context_.disable_dynamic_shapes) {
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Starting backend initialization. "
<< "Creating backend Dynamic Shapes";
try {
@@ -197,9 +196,10 @@ BackendManager::BackendManager(SessionContext& session_context,
}
}
}
if (session_context_.so_context_enable && !subgraph_context_.is_ep_ctx_graph) {
if (session_context_.so_context_enable &&
(subgraph_context_.is_ep_ctx_ovir_encapsulated || !subgraph_context_.is_ep_ctx_graph)) {
auto status = onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph);
if ((!status.IsOK())) {
if (!status.IsOK()) {
ORT_THROW(status);
}
}
@@ -447,8 +447,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,

const auto& onnx_model_path_name = subgraph.ModelPath();
// QDQ stripping enabled only for the NPU and experimentally on the GPU
if ((session_context_.device_type.find("NPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos) &&
if ((session_context_.device_type.find("NPU") != std::string::npos) &&
(enable_ovep_qdq_optimizer || session_context_.so_share_ep_contexts)) {
std::unique_ptr<onnxruntime::Model> model;
Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, session_context_.so_share_ep_contexts, enable_ovep_qdq_optimizer, model, shared_context_.shared_weights);
@@ -458,6 +457,17 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node);
ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
return model_proto;
} else if ((session_context_.device_type.find("GPU") != std::string::npos) &&
enable_ovep_qdq_optimizer) {
// Create a copy of the model
std::unique_ptr<onnxruntime::Model> model;
Status status = qdq_scales_fix::Transform(subgraph, logger, model);

[Review comment] Is this pass happening even for non-quantized models?

auto model_proto = model->ToProto();
model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION);
print_model_proto_duration();
DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node);
ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
return model_proto;
} else {
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP QDQ optimization pass is disabled";
auto model = subgraph.CreateModel(logger);
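
Note: the body of qdq_scales_fix::Transform is not shown in this diff, so the sketch below is not the EP's implementation. It only illustrates the standard QDQ arithmetic such a pass rests on: dequantization is `(q - zero_point) * scale`, so a DQ's scale can be rescaled by a factor `k` (for example, to keep very small scales representable) as long as a compensating `1/k` is folded into a downstream scale or initializer. The factor `k` here is hypothetical.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// Standard ONNX QuantizeLinear/DequantizeLinear semantics (uint8):
//   q  = clamp(round(x / scale) + zero_point, 0, 255)
//   x' = (q - zero_point) * scale
uint8_t Quantize(float x, float scale, int zp) {
  int q = static_cast<int>(std::lround(x / scale)) + zp;
  return static_cast<uint8_t>(std::clamp(q, 0, 255));
}

float Dequantize(uint8_t q, float scale, int zp) {
  return static_cast<float>(static_cast<int>(q) - zp) * scale;
}

int main() {
  const float x = 0.42f;
  const float scale = 1.0f / 256.0f;
  const int zp = 128;
  const uint8_t q = Quantize(x, scale, zp);

  // A scale-propagation pass may rewrite a DQ's scale by a factor k,
  // provided a compensating 1/k lands downstream. k = 16 is a
  // hypothetical choice; powers of two cancel exactly in float.
  const float k = 16.0f;
  const float direct = Dequantize(q, scale, zp);
  const float rescaled = Dequantize(q, scale * k, zp) * (1.0f / k);
  std::cout << direct << " == " << rescaled << '\n';
  return 0;
}
```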
@@ -579,30 +589,34 @@ void BackendManager::ValidateInputShapes(const reshape_t& shapes,
void BackendManager::Compute(OrtKernelContext* context) {
Ort::KernelContext ctx(context);
std::chrono::high_resolution_clock::time_point start_compute, end_compute;
bool cpu_or_gpu = session_context_.device_type.find("CPU") != std::string::npos ||
session_context_.device_type.find("GPU") != std::string::npos;
bool npu = session_context_.device_type.find("NPU") != std::string::npos;

#ifdef OPENVINO_FIL_ENABLED
static bool fil_enabled = true;
if (fil_enabled) {
start_compute = std::chrono::high_resolution_clock::now();
LOGS_DEFAULT(INFO) << "Start Compute";
}
#endif
// OV NPU doesn't support dynamic shaped model inference.

// If disable_dynamic_shapes is set to true, a dynamic model is executed by
// rewriting it to a static-shaped model at runtime based on the input shape.
// disable_dynamic_shapes is always set to true for the OV NPU plugin.
if (subgraph_context_.has_dynamic_input_shape &&
!session_context_.disable_dynamic_shapes &&
(cpu_or_gpu || (npu && session_context_.enable_causallm))) {
// disable_dynamic_shapes should be set for devices that don't support dynamic shapes.
bool need_dynamic_backend = subgraph_context_.has_dynamic_input_shape &&
session_context_.disable_dynamic_shapes;

if (!need_dynamic_backend) {
concrete_backend_->Infer(context);
} else if (subgraph_context_.has_dynamic_input_shape) {
} else {
std::vector<std::vector<int64_t>> tensor_shapes = GetInputTensorShapes(ctx);
auto key = MakeMapKeyString(tensor_shapes, session_context_.device_type);
std::shared_ptr<IBackend> dynamic_backend;
auto search = backend_map_.find(key);
if (search == backend_map_.end()) {

{
std::unique_lock<std::mutex> lock(mutex_);
dynamic_backend = backend_map_[key];
}

if (!dynamic_backend) {
ptr_stream_t model_stream;
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] "
<< "Creating dynamic backend for key: " << key;
@@ -643,14 +657,11 @@ void BackendManager::Compute(OrtKernelContext* context) {
}
#endif
}
std::unique_lock<std::mutex> lock(mutex_);
backend_map_.insert({key, dynamic_backend});
} else {
dynamic_backend = search->second;
}

dynamic_backend->Infer(context);
} else {
concrete_backend_->Infer(context);
}
#ifdef OPENVINO_FIL_ENABLED
if (fil_enabled) {
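The reworked Compute() above caches one backend per concrete input-shape key: it looks the key up under the new `mutex_`, builds the backend outside the lock, then stores it under the lock again. A condensed sketch of that pattern follows (types reduced; `std::make_shared<IBackend>()` stands in for the backend-creation code elided from the hunk):

```cpp
#include <map>
#include <memory>
#include <mutex>
#include <string>

struct IBackend { /* compiled model + infer requests in the real EP */ };

std::mutex mutex_;
std::map<std::string, std::shared_ptr<IBackend>> backend_map_;

std::shared_ptr<IBackend> GetOrCreateBackend(const std::string& key) {
  std::shared_ptr<IBackend> backend;
  {
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = backend_map_.find(key);
    if (it != backend_map_.end()) backend = it->second;
  }
  if (!backend) {
    // Built outside the lock: compilation is slow and should not serialize
    // unrelated shapes. Two threads racing on the same key may both build;
    // the assignment below lets the last writer win, which is harmless
    // because both backends are equivalent for identical keys.
    backend = std::make_shared<IBackend>();  // stands in for backend creation
    std::lock_guard<std::mutex> lock(mutex_);
    backend_map_[key] = backend;
  }
  return backend;
}
```

The sketch deliberately uses find() for the lookup and assignment for the store: looking up with operator[] would leave an empty entry in the map on a miss, which a later insert() would then refuse to overwrite.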
1 change: 1 addition & 0 deletions onnxruntime/core/providers/openvino/backend_manager.h
@@ -54,6 +54,7 @@ class BackendManager {

std::unique_ptr<ONNX_NAMESPACE::ModelProto> model_proto_;
std::shared_ptr<IBackend> concrete_backend_;
std::mutex mutex_;
std::map<std::string, std::shared_ptr<IBackend>> backend_map_;
SubGraphContext subgraph_context_;
EPCtxHandler& ep_ctx_handle_;
71 changes: 29 additions & 42 deletions onnxruntime/core/providers/openvino/backend_utils.cc
@@ -179,32 +179,6 @@ CreateOVModel(std::string&& model,
}
}

Ort::UnownedValue
GetOutputTensor(Ort::KernelContext& context, size_t batch_size,
OVInferRequestPtr infer_request,
std::string output_name,
const SubGraphContext::string_index_map_t& output_names) {
auto graph_output_blob = infer_request->GetTensor(output_name);

auto graph_output_dims = graph_output_blob->get_shape();

if (batch_size > 1) {
// Add the batch size as dim 0.
graph_output_dims.insert(graph_output_dims.begin(), batch_size);
}
size_t num_dims = graph_output_dims.size();
std::unique_ptr<int64_t[]> output_shape(new int64_t[num_dims]);
for (size_t j = 0; j < num_dims; j++) {
output_shape[j] = static_cast<int64_t>(graph_output_dims[j]);
}
auto it = output_names.find(output_name);
if (it == output_names.end()) {
ORT_THROW(log_tag + "Output names mismatch between OpenVINO and ONNX");
}
int index = it->second;
return context.GetOutput(index, output_shape.get(), num_dims);
}

Ort::UnownedValue
GetOutputTensor(Ort::KernelContext& context,
std::string output_name,
@@ -220,14 +194,9 @@ GetOutputTensor(Ort::KernelContext& context,
ORT_THROW(log_tag + "Output names mismatch between OpenVINO and ONNX");
}
int index = it->second;
auto shape = node->get_shape();
auto output_shape = ParameterShape::ToOrtShape(node->get_shape());

size_t num_dims = shape.size();
std::unique_ptr<int64_t[]> output_shape(new int64_t[num_dims]);
for (size_t j = 0; j < num_dims; j++) {
output_shape[j] = static_cast<int64_t>(shape[j]);
}
return context.GetOutput(index, output_shape.get(), num_dims);
return context.GetOutput(index, output_shape);
}

int GetFirstAvailableDevice(SessionContext& session_context) {
Expand Down Expand Up @@ -312,15 +281,6 @@ void FillInputBlob(OVTensorPtr inputBlob, size_t batch_slice_idx,
std::memcpy(input_data, batch_memory_offset, input_data_size);
}

void FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor,
size_t batch_slice_idx) {
auto output_data = outputBlob->data();
size_t output_data_size = outputBlob->get_byte_size();
char* tensor_data = output_tensor.GetTensorMutableData<char>();
char* batch_memory_offset = tensor_data + output_data_size * batch_slice_idx;
std::memcpy(batch_memory_offset, output_data, output_data_size);
}

void printPerformanceCounts(const std::vector<OVProfilingInfo>& performanceMap,
std::ostream& stream, std::string deviceName) {
int64_t totalTime = 0;
Expand Down Expand Up @@ -440,6 +400,33 @@ void DestroyOVTensors(SharedContext::SharedWeights::Metadata::Map& metadata_map)
metadata_map.clear();
}

bool IsModelStreamXML(std::istream& model_stream) {
std::streampos originalPos = model_stream.tellg();

// first, get the total size of model_stream in bytes
model_stream.seekg(0, std::ios::end);
auto end_pos = model_stream.tellg();
// Restore the stream position
model_stream.seekg(originalPos);
auto total_size = end_pos - originalPos;

// Choose 32 bytes to hold the content of:
// '<?xml version="1.0"?> <net '
const std::streamsize header_check_len = 32;
ORT_ENFORCE(total_size > header_check_len);

// read 32 bytes into header
std::string header(header_check_len, '\0');
model_stream.read(&header[0], header_check_len);
// Clear any read errors
model_stream.clear();
// Restore the stream position
model_stream.seekg(originalPos);

// return true if the header starts with '<?xml' and also includes '<net '
return ((header.rfind("<?xml", 0) == 0) && (header.find("<net ") != std::string::npos));
}

} // namespace backend_utils
} // namespace openvino_ep
} // namespace onnxruntime
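
IsModelStreamXML sniffs the first 32 bytes to tell OpenVINO IR XML apart from an ONNX binary stream, restoring the caller's stream position either way. A self-contained check of that behavior on in-memory streams (the `LooksLikeXML` helper reproduces the logic for illustration; it tolerates short reads instead of enforcing a minimum size):

```cpp
#include <cassert>
#include <iostream>
#include <sstream>
#include <string>

// Reproduces the IsModelStreamXML header sniff so the example is standalone.
bool LooksLikeXML(std::istream& s) {
  const std::streamsize header_check_len = 32;
  const std::streampos original_pos = s.tellg();
  std::string header(header_check_len, '\0');
  s.read(&header[0], header_check_len);
  s.clear();              // clear eof/fail bits from a short read
  s.seekg(original_pos);  // restore the caller's position
  return header.rfind("<?xml", 0) == 0 && header.find("<net ") != std::string::npos;
}

int main() {
  std::istringstream ir(R"(<?xml version="1.0"?> <net name="m" version="11">)");
  std::istringstream onnx("\x08\x07\x12\x04onnx");  // protobuf-like ONNX bytes
  assert(LooksLikeXML(ir));
  assert(!LooksLikeXML(onnx));
  std::cout << ir.tellg() << '\n';  // prints 0: position was restored
  return 0;
}
```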
53 changes: 43 additions & 10 deletions onnxruntime/core/providers/openvino/backend_utils.h
@@ -27,8 +27,48 @@

namespace onnxruntime {
namespace openvino_ep {
inline const std::string log_tag = "[OpenVINO-EP] ";

struct ParameterShape {
using ort_shape_t = std::vector<int64_t>;

static ov::PartialShape ToOvPartialShape(const ort_shape_t& ort_shape) {
std::vector<ov::Dimension> ov_shape(ort_shape.size());
std::transform(ort_shape.begin(), ort_shape.end(), ov_shape.begin(), [](int64_t dim) {
return dim == -1 ? ov::Dimension::dynamic() : ov::Dimension(dim);
});
return ov::PartialShape(ov_shape);
}

static ort_shape_t ToOrtShape(const ov::PartialShape& ov_shape) {
ort_shape_t ort_shape(ov_shape.size());
std::transform(ov_shape.begin(), ov_shape.end(), ort_shape.begin(), [](const auto& dim) {
return dim.is_dynamic() ? -1 : dim.get_length();
});
return ort_shape;
}

static ort_shape_t ToOrtShape(const ov::Shape& ov_shape) {
ort_shape_t ort_shape(ov_shape.size());
std::transform(ov_shape.begin(), ov_shape.end(), ort_shape.begin(), [](const auto& dim) {
return narrow<int64_t>(dim);
});
return ort_shape;
}

operator ov::Shape() const { return ov_.get_shape(); }
operator const ov::PartialShape&() const { return ov_; }
operator const ort_shape_t&() const { return ort_; }

explicit ParameterShape(const ort_shape_t& ort_shape) : ort_(ort_shape), ov_(ToOvPartialShape(ort_shape)) {}
explicit ParameterShape(const ov::PartialShape& ov_partial_shape) : ov_(ov_partial_shape), ort_(ToOrtShape(ov_partial_shape)) {}

private:
ort_shape_t ort_;
ov::PartialShape ov_;
};

namespace backend_utils {
const std::string log_tag = "[OpenVINO-EP] ";

bool IsDebugEnabled();

@@ -48,19 +88,10 @@ GetOutputTensor(Ort::KernelContext& context,
const SubGraphContext::string_index_map_t& output_names,
std::shared_ptr<ov::Node> node);

Ort::UnownedValue
GetOutputTensor(Ort::KernelContext& context, size_t batch_size,
OVInferRequestPtr infer_request,
std::string output_name,
const SubGraphContext::string_index_map_t& output_names);

void FillInputBlob(OVTensorPtr inputBlob, size_t batch_slice_idx,
std::string input_name, Ort::KernelContext& context,
const SubGraphContext& subgraph_context);

void FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor,
size_t batch_slice_idx);

std::shared_ptr<const OVNetwork>
CreateOVModel(std::string&& model,
const SessionContext& session_context,
Expand All @@ -76,6 +107,8 @@ void printPerformanceCounts(const std::vector<OVProfilingInfo>& performanceMap,

void printPerformanceCounts(OVInferRequestPtr request, std::ostream& stream, std::string deviceName);

bool IsModelStreamXML(std::istream& model_stream);

} // namespace backend_utils
} // namespace openvino_ep
} // namespace onnxruntime
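
The new ParameterShape value type keeps the ORT and OpenVINO representations of one shape in sync, with -1 standing for a dynamic dimension on the ORT side; this replaces the manual conversion loops the deleted GetOutputTensor overload used to carry. A usage sketch, assuming OpenVINO headers and the EP's include path are available (not taken from the PR's tests):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

#include <openvino/openvino.hpp>
#include "core/providers/openvino/backend_utils.h"  // ParameterShape (path assumed)

int main() {
  using onnxruntime::openvino_ep::ParameterShape;

  // ORT convention: -1 marks a dynamic dimension.
  ParameterShape shape(std::vector<int64_t>{1, -1, 224, 224});

  // Implicit conversions hand each API its own representation.
  const ov::PartialShape& as_ov = shape;
  const std::vector<int64_t>& as_ort = shape;

  std::cout << as_ov << '\n';      // [1,?,224,224]: -1 became ov::Dimension::dynamic()
  std::cout << as_ort[1] << '\n';  // -1

  // Static ov::Shape values round-trip losslessly:
  auto ort = ParameterShape::ToOrtShape(ov::Shape{1, 3, 8});  // {1, 3, 8}
  std::cout << ort.size() << '\n';
  return 0;
}
```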