[NPUW] L0 allocation improvements #27011

Merged 16 commits on Oct 14, 2024
Changes from 2 commits
19 changes: 17 additions & 2 deletions src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp
@@ -7,12 +7,16 @@
#include "compiled_model.hpp"
#include "intel_npu/al/config/npuw.hpp"
#include "logging.hpp"
#include "openvino/runtime/iplugin.hpp"
#include "openvino/runtime/iremote_context.hpp"
#include "util.hpp"

ov::npuw::IBaseInferRequest::IBaseInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
ov::npuw::IBaseInferRequest::IBaseInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model,
bool alloc_required)
: ov::ISyncInferRequest(compiled_model),
m_npuw_model(compiled_model),
m_num_submodels(m_npuw_model->m_compiled_submodels.size()) {
m_num_submodels(m_npuw_model->m_compiled_submodels.size()),
m_alloc_required(alloc_required) {
m_subrequests.resize(m_num_submodels, {});
m_subrequest_devices.resize(m_num_submodels, {});
m_completion_cbs.resize(m_num_submodels, {});
@@ -21,6 +25,17 @@ ov::npuw::IBaseInferRequest::IBaseInferRequest(const std::shared_ptr<ov::npuw::C
}
}

ov::Tensor ov::npuw::IBaseInferRequest::allocTensor(const ov::element::Type type,
const ov::Shape& shape,
const std::string& device) {
if (!m_alloc_required || device == "CPU") {
return ov::Tensor(type, shape);
}
static const auto& m_remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
std::lock_guard<std::mutex> guard(m_alloc_mutex);
return ov::make_tensor(m_remote_ctx->create_host_tensor(type, shape));
}
smirnov-alexey marked this conversation as resolved.
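For context, the remote branch of allocTensor above maps onto the following standalone OpenVINO usage. This is an illustrative sketch only; the device name and shape are placeholders, not values taken from the PR.

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;

    // Level Zero path: a host tensor created through the device's default remote
    // context is backed by memory the NPU driver can import directly, which is
    // what allocTensor() relies on to avoid extra copies at infer time.
    ov::RemoteContext ctx = core.get_default_context("NPU");
    ov::Tensor remote_host = ctx.create_host_tensor(ov::element::f32, ov::Shape{1, 3, 224, 224});

    // Plain host allocation - the fallback allocTensor() uses for the CPU / no-alloc case.
    ov::Tensor plain(ov::element::f32, ov::Shape{1, 3, 224, 224});
    return 0;
}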

ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_requests(std::size_t id,
std::size_t nireq,
bool* recompiled) {
src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp
@@ -7,6 +7,7 @@
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <vector>
@@ -25,7 +26,7 @@ class CompiledModel;
// individual subrequests' execution
class IBaseInferRequest : public ov::ISyncInferRequest {
public:
explicit IBaseInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>&);
explicit IBaseInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>&, bool);

// Execution API - explicitly "finalize" the infer() here
void infer() override; // final - not final yet
@@ -40,6 +41,8 @@ class IBaseInferRequest : public ov::ISyncInferRequest {

void check_tensors() const override;

ov::Tensor allocTensor(const ov::element::Type type, const ov::Shape& shape, const std::string& device = "NPU");

using sptr = std::shared_ptr<IBaseInferRequest>;
using Completed = std::function<void(std::exception_ptr)>;

@@ -109,6 +112,9 @@ class IBaseInferRequest : public ov::ISyncInferRequest {

const std::size_t m_num_submodels;

bool m_alloc_required;
std::mutex m_alloc_mutex;

void dump_input_tensors(std::size_t idx);
void dump_output_tensors(std::size_t idx);

16 changes: 14 additions & 2 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -428,6 +428,18 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
// Finalize memory in closures and weight banks
finalize_weights_bank();

// Set allocator if there is at least 1 NPU submodel
for (std::size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
auto& comp_model_desc = m_compiled_submodels[idx];
if (!comp_model_desc.compiled_model) {
continue;
}
if (*comp_model_desc.device_it == "NPU") {
m_alloc_required = true;
break;
}
}
dmatveev marked this conversation as resolved.

// Print stats report when possible
{
LOG_INFO("Initial device distribution:");
@@ -448,7 +460,7 @@ void ov::npuw::CompiledModel::finalize_weights_bank() {

// Skip optimized out and non-functions
if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
return;
continue;
}

const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
@@ -668,7 +680,7 @@ void ov::npuw::CompiledModel::dump_on_fail(std::size_t id, const std::string& de

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_just_sync_infer_request() {
auto this_sptr = std::static_pointer_cast<ov::npuw::CompiledModel>(shared_from_this());
return std::make_shared<ov::npuw::JustInferRequest>(this_sptr);
return std::make_shared<ov::npuw::JustInferRequest>(this_sptr, m_alloc_required);
}

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_sync_infer_request() const {
2 changes: 2 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -157,6 +157,8 @@ class CompiledModel : public ov::ICompiledModel {

bool m_update_required;

bool m_alloc_required = false;
dmatveev marked this conversation as resolved.

std::function<bool(const ov::SoPtr<ov::ITensor>&, const ov::SoPtr<ov::ITensor>&)> m_acc_check;
std::string m_ref_device;

23 changes: 12 additions & 11 deletions src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
@@ -21,8 +21,9 @@
#include "util.hpp"
#include "weights_bank.hpp"

ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
: IBaseInferRequest(compiled_model) {
ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model,
bool alloc_required)
dmatveev marked this conversation as resolved.
: IBaseInferRequest(compiled_model, alloc_required) {
m_use_function_pipelining = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FUNCALL_ASYNC>();
if (m_use_function_pipelining) {
LOG_WARN("Function call pipelining is enabled for " << m_npuw_model->m_name
@@ -49,7 +50,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
// FIXME: Shouldn't this be handled by the base class? (in create_tensor)
// A special case for function calls
if (comp_model_desc.replaced_by) {
// Pre-allocate output tensors for this function call
// Pre-allocate output tensors for this function call
const auto real_idx = comp_model_desc.replaced_by.value();
auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx];
auto& proto_comp_model = proto_comp_model_desc.compiled_model;
@@ -67,14 +68,14 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
// Note: these buffers are allocated to the entire NWAY (> tail_size)
for (auto&& p : proto_comp_model_desc.spatial->params) {
const auto& iport = proto_comp_model_desc.compiled_model->inputs()[p.idx];
m_spatial_io[real_idx].input_tails[p.idx] =
ov::get_tensor_impl(ov::Tensor(iport.get_element_type(), iport.get_shape()));
m_spatial_io[real_idx].input_tails[p.idx] = ov::get_tensor_impl(
allocTensor(iport.get_element_type(), iport.get_shape(), *proto_comp_model_desc.device_it));
Contributor: just thinking.. if you're using allocTensor only where we store ITensors, why can't allocTensor return the ITensor so you don't need to call get_tensor_impl everywhere?

Contributor Author: Done
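Sketched below is what the suggestion amounts to, assuming the members introduced in this PR (m_alloc_required, m_alloc_mutex, m_npuw_model); the exact signature that landed after the "Done" above may differ.

// allocTensor returning ov::SoPtr<ov::ITensor> directly, so call sites can drop
// the ov::get_tensor_impl() wrapping.
ov::SoPtr<ov::ITensor> ov::npuw::IBaseInferRequest::allocTensor(const ov::element::Type type,
                                                                const ov::Shape& shape,
                                                                const std::string& device) {
    if (!m_alloc_required || device == "CPU") {
        // Plain host allocation, wrapped into an ITensor
        return ov::get_tensor_impl(ov::Tensor(type, shape));
    }
    const auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
    std::lock_guard<std::mutex> guard(m_alloc_mutex);
    return remote_ctx->create_host_tensor(type, shape);
}

// The call site above then simplifies to:
//     m_spatial_io[real_idx].input_tails[p.idx] =
//         allocTensor(iport.get_element_type(), iport.get_shape(), *proto_comp_model_desc.device_it);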

}
const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size();
for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) {
const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx];
m_spatial_io[real_idx].output_tails[out_idx] =
ov::get_tensor_impl(ov::Tensor(oport.get_element_type(), oport.get_shape()));
m_spatial_io[real_idx].output_tails[out_idx] = ov::get_tensor_impl(
allocTensor(oport.get_element_type(), oport.get_shape(), *proto_comp_model_desc.device_it));
}
}
} // if(spatial)
Expand All @@ -88,7 +89,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
shape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range;
}
m_funcall_result[LinkFrom{i, out_idx}] =
ov::get_tensor_impl(ov::Tensor(port.get_element_type(), shape));
ov::get_tensor_impl(allocTensor(port.get_element_type(), shape, *proto_comp_model_desc.device_it));
}
if (real_idx != i) {
// If this function call is NOT the function body, do nothing here - the original
Expand Down Expand Up @@ -153,7 +154,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
LOG_INFO("Preallocating input tensors...");
for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
const auto& port = m_npuw_model->inputs()[i];
m_input_tensors.push_back(ov::get_tensor_impl(ov::Tensor(port.get_element_type(), port.get_shape())));
m_input_tensors.push_back(ov::get_tensor_impl(allocTensor(port.get_element_type(), port.get_shape())));
Contributor: maybe we need an overload for allocTensor which takes ov::Input<ov::Node> / ov::Output<ov::Node>.

Contributor: note - you're not passing any device here. And you have ="NPU" as the default parameter.

Contributor Author: About the default device - that was the idea

Contributor Author: Done
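One possible shape for the overload discussed above, using a hypothetical name (allocOut) and building on the ITensor-returning allocTensor from the earlier sketch; compiled-model ports are ov::Output<const ov::Node>, so that is the type taken here.

ov::SoPtr<ov::ITensor> ov::npuw::IBaseInferRequest::allocOut(const ov::Output<const ov::Node>& port,
                                                             const std::string& device) {
    // The declaration would default `device` to "NPU", mirroring allocTensor.
    return allocTensor(port.get_element_type(), port.get_shape(), device);
}

// The preallocation loop above then reads:
//     m_input_tensors.push_back(allocOut(port));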

m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true};
} // for(inputs)

Expand All @@ -173,7 +174,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
const auto& tensor =
funcall_result_iter != m_funcall_result.end()
? funcall_result_iter->second // Function calls have their tensors allocated, so just use one
: ov::get_tensor_impl(ov::Tensor(port.get_element_type(), port.get_shape()));
: ov::get_tensor_impl(allocTensor(port.get_element_type(), port.get_shape()));

m_output_tensors.push_back(tensor);
m_port_to_tensor[port] = TensorStorage{tensor, true};
Expand Down Expand Up @@ -372,7 +373,7 @@ void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) {
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];
const auto real_idx = comp_model_desc.replaced_by.value_or(idx);

const bool do_copy = needs_copy(idx);
const bool do_copy = !m_alloc_required && needs_copy(idx);
Contributor Author: Not sure if it's correct

Contributor Author: Here we check global parameters. The idea is that global parameters allocation solely depends on m_alloc_required - if it's set, params will be allocated on NPU

Contributor: haven't checked on this yet

Contributor: I think m_alloc_required is overall misleading. It is always required.

Also, you've missed this in my past L0 PR: dmatveev#5

More precisely, this part: https://github.com/dmatveev/openvino/blob/e7d62f1a4412f639d0fb112e4f5647eeff9a1b8e/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp#L117

And then this part: https://github.com/dmatveev/openvino/blob/e7d62f1a4412f639d0fb112e4f5647eeff9a1b8e/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp#L370

The backstory here is that, even if you've allocated your model-global input tensors yourself, they may be overwritten. Even our scripts carelessly do this, unfortunately. So what you need to do is to keep track of the tensors you allocate (maybe you can just memorize the pointers in your new allocTensor method) and check if the tensors you're working with are still "known" to you.

Once there's a set_tensor call - you lose it, and your m_alloc_required flag doesn't tell the truth.

Contributor Author: Good point, thanks
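A minimal sketch of the bookkeeping described above, under the assumptions of the earlier sketches; m_alloc_tensors and is_our_tensor are hypothetical names, not code from this PR.

// Inside allocTensor(), right after the tensor is created (host or remote path):
//     std::lock_guard<std::mutex> guard(m_alloc_mutex);
//     m_alloc_tensors.insert(tensor._ptr.get());  // hypothetical member:
//                                                 // std::unordered_set<const ov::ITensor*> m_alloc_tensors;

bool ov::npuw::IBaseInferRequest::is_our_tensor(const ov::SoPtr<ov::ITensor>& tensor) const {
    // A set_tensor() call binds a different tensor to the port, so the lookup fails
    // and the caller falls back to the copying path.
    return m_alloc_tensors.count(tensor._ptr.get()) > 0;
}

// bind_global_parameters() can then decide per tensor instead of per request:
//     const bool do_copy = needs_copy(idx) && !is_our_tensor(get_tensor(port));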

const auto& iodesc = m_subrequests_gio.at(idx);

const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx];
src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp
@@ -19,7 +19,7 @@ class AsyncInferRequest;

class JustInferRequest final : public IBaseInferRequest {
public:
explicit JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);
explicit JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model, bool alloc_required);

// Query APIs
std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override;
@@ -74,8 +74,8 @@ std::vector<Avoid> getAvoids(::intel_npu::Config& cfg) {

std::string avoids_opt = cfg.getString<::intel_npu::NPUW_ONLINE_AVOID>();
if (avoids_opt.empty()) {
LOG_WARN(::intel_npu::NPUW_ONLINE_AVOID().key()
<< " property is not set! NPU device will be prioritized for every subgraph.");
LOG_VERB(::intel_npu::NPUW_ONLINE_AVOID().key()
<< " property is not set. NPU device will be prioritized for every subgraph.");
return {};
}

12 changes: 7 additions & 5 deletions src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
@@ -95,12 +95,14 @@ ov::Tensor Bank::unsafe_eval_and_alloc(const LazyTensor& tensor, const std::stri
return transformed_tensor;
}

// FIXME: L0 allocation may crash when run in parallel
std::lock_guard<std::mutex> guard(m_alloc_mutex);

m_remote_ctx = m_core->get_default_context(device_for_alloc)._ptr;
auto remote_tensor =
m_remote_ctx->create_host_tensor(transformed_tensor.get_element_type(), transformed_tensor.get_shape());
ov::SoPtr<ov::ITensor> remote_tensor;
{
// FIXME: L0 allocation may crash when run in parallel
std::lock_guard<std::mutex> guard(m_alloc_mutex);
remote_tensor =
m_remote_ctx->create_host_tensor(transformed_tensor.get_element_type(), transformed_tensor.get_shape());
}
auto allocated_tensor = ov::make_tensor(remote_tensor);
transformed_tensor.copy_to(allocated_tensor);
m_device_bank[device_for_alloc][tensor] = allocated_tensor;