Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NPUW] L0 allocation improvements #27011

Merged
merged 16 commits into from
Oct 14, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,18 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
// Finalize memory in closures and weight banks
finalize_weights_bank();

// Set allocator if there is at least 1 NPU submodel
for (std::size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
auto& comp_model_desc = m_compiled_submodels[idx];
if (!comp_model_desc.compiled_model) {
continue;
}
if (*comp_model_desc.device_it == "NPU") {
m_alloc_required = true;
break;
}
}
dmatveev marked this conversation as resolved.
Show resolved Hide resolved

// Print stats report when possible
{
LOG_INFO("Initial device distribution:");
Expand All @@ -442,13 +454,14 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
}

void ov::npuw::CompiledModel::finalize_weights_bank() {
LOG_INFO("Finalizing weights bank...");
// Register lazy tensors
for (std::size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
auto& comp_model_desc = m_compiled_submodels[idx];

// Skip optimized out and non-functions
if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
return;
continue;
}

const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
Expand Down Expand Up @@ -489,6 +502,8 @@ void ov::npuw::CompiledModel::finalize_weights_bank() {
comp_model_desc.is_remote[tidx] = m_weights_bank->is_remote(lt);
}
}

LOG_INFO("Done.");
}

void ov::npuw::CompiledModel::remove_long_output_names(const std::shared_ptr<ov::Model>& model) {
Expand Down Expand Up @@ -668,7 +683,7 @@ void ov::npuw::CompiledModel::dump_on_fail(std::size_t id, const std::string& de

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_just_sync_infer_request() {
auto this_sptr = std::static_pointer_cast<ov::npuw::CompiledModel>(shared_from_this());
return std::make_shared<ov::npuw::JustInferRequest>(this_sptr);
return std::make_shared<ov::npuw::JustInferRequest>(this_sptr, m_alloc_required);
}

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_sync_infer_request() const {
Expand Down
2 changes: 2 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,8 @@ class CompiledModel : public ov::ICompiledModel {

bool m_update_required;

bool m_alloc_required = false;
dmatveev marked this conversation as resolved.
Show resolved Hide resolved

std::function<bool(const ov::SoPtr<ov::ITensor>&, const ov::SoPtr<ov::ITensor>&)> m_acc_check;
std::string m_ref_device;

Expand Down
48 changes: 36 additions & 12 deletions src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,14 @@
#include "openvino/core/except.hpp"
#include "openvino/core/parallel.hpp"
#include "openvino/runtime/iasync_infer_request.hpp"
#include "openvino/runtime/make_tensor.hpp"
#include "plugin.hpp"
#include "util.hpp"
#include "weights_bank.hpp"

ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
: IBaseInferRequest(compiled_model) {
ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model,
bool alloc_required)
dmatveev marked this conversation as resolved.
Show resolved Hide resolved
: IBaseInferRequest(compiled_model),
m_alloc_required(alloc_required) {
m_use_function_pipelining = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FUNCALL_ASYNC>();
if (m_use_function_pipelining) {
LOG_WARN("Function call pipelining is enabled for " << m_npuw_model->m_name
Expand All @@ -49,7 +50,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
// FIXME: Shouldn't this be handled by the base class? (in create_tensor)
// A special case for function calls
if (comp_model_desc.replaced_by) {
// Pre-allocate output tesnors for this function call
// Pre-allocate output tensors for this function call
const auto real_idx = comp_model_desc.replaced_by.value();
auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx];
auto& proto_comp_model = proto_comp_model_desc.compiled_model;
Expand All @@ -67,14 +68,14 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
// Note: these buffers are allocated to the entire NWAY (> tail_size)
for (auto&& p : proto_comp_model_desc.spatial->params) {
const auto& iport = proto_comp_model_desc.compiled_model->inputs()[p.idx];
m_spatial_io[real_idx].input_tails[p.idx] =
ov::get_tensor_impl(ov::Tensor(iport.get_element_type(), iport.get_shape()));
m_spatial_io[real_idx].input_tails[p.idx] = ov::get_tensor_impl(
allocTensor(iport.get_element_type(), iport.get_shape(), *proto_comp_model_desc.device_it));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just thinking.. if you're using allocTensor only where we store ITensors, why can't allocTensor return the ITensor so you don't need to call get_tensor_impl everywhere?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

}
const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size();
for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) {
const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx];
m_spatial_io[real_idx].output_tails[out_idx] =
ov::get_tensor_impl(ov::Tensor(oport.get_element_type(), oport.get_shape()));
m_spatial_io[real_idx].output_tails[out_idx] = ov::get_tensor_impl(
allocTensor(oport.get_element_type(), oport.get_shape(), *proto_comp_model_desc.device_it));
}
}
} // if(spatial)
Expand All @@ -88,7 +89,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
shape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range;
}
m_funcall_result[LinkFrom{i, out_idx}] =
ov::get_tensor_impl(ov::Tensor(port.get_element_type(), shape));
ov::get_tensor_impl(allocTensor(port.get_element_type(), shape, *proto_comp_model_desc.device_it));
}
if (real_idx != i) {
// If this function call is NOT the function body, do nothing here - the original
Expand Down Expand Up @@ -153,7 +154,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
LOG_INFO("Preallocating input tensors...");
for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
const auto& port = m_npuw_model->inputs()[i];
m_input_tensors.push_back(ov::get_tensor_impl(ov::Tensor(port.get_element_type(), port.get_shape())));
m_input_tensors.push_back(ov::get_tensor_impl(allocTensor(port.get_element_type(), port.get_shape())));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe we need an overload for allocTensor which takes ov::Input<ov::Node> / ov::Output<ov::Node>.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

note - you're not passing any device here. And you have ="NPU" as the default parameter..

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

About the default device - that was the idea

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true};
} // for(inputs)

Expand All @@ -173,7 +174,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
const auto& tensor =
funcall_result_iter != m_funcall_result.end()
? funcall_result_iter->second // Function calls have their tensors allocated, so just use one
: ov::get_tensor_impl(ov::Tensor(port.get_element_type(), port.get_shape()));
: ov::get_tensor_impl(allocTensor(port.get_element_type(), port.get_shape()));

m_output_tensors.push_back(tensor);
m_port_to_tensor[port] = TensorStorage{tensor, true};
Expand Down Expand Up @@ -372,7 +373,7 @@ void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) {
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];
const auto real_idx = comp_model_desc.replaced_by.value_or(idx);

const bool do_copy = needs_copy(idx);
const bool do_copy = !m_alloc_required && needs_copy(idx);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if it's correct

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here we check global parameters. The idea is that global parameters allocation solely depends on m_alloc_required - if it's set, params will be allocated on NPU

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

haven't checked on this yet

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think m_alloc_required is overall misleading. It is always required.

Also, you've missed this in my past L0 PR: dmatveev#5

More precisely, this part: https://github.com/dmatveev/openvino/blob/e7d62f1a4412f639d0fb112e4f5647eeff9a1b8e/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp#L117

And then this part: https://github.com/dmatveev/openvino/blob/e7d62f1a4412f639d0fb112e4f5647eeff9a1b8e/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp#L370

The backstory here is that, even if you've allocated your model-global input tensors yourself, they may be overwritten. Even our scripts carelessly do this, unfortunately. So what you need to do is to keep track of the tensors you allocate (maybe you can just memorize the pointers in your new allocTensor method) and check whether the tensors you're working with are still "known" to you.

Once there's a set_tensor call - you lose it, and your m_alloc_required flag no longer tells the truth.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, thanks

const auto& iodesc = m_subrequests_gio.at(idx);

const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx];
Expand Down Expand Up @@ -919,6 +920,29 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool
} // if (replaced_by)
}

ov::Tensor ov::npuw::JustInferRequest::allocTensor(const ov::element::Type type,
const ov::Shape& shape,
const std::string& device) {
if (!m_alloc_required || device == "CPU") {
return ov::Tensor(type, shape);
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this, combined with having device="NPU" by default, looks like a very obscure way to allocate the tensors in the right region.

So you're using this method in three contexts:

  1. For function results - where device is taken into account
  2. For global inputs, where device is not passed and defaults to "NPU" (meh)
  3. For global results, same?

Also the logic completely discards the WEIGHTS_BANK_ALLOC setting we have here. OK, this is not weights, but that flag exists for a reason you know. Why do we think we bypass that problem here?

A clearer way this could have been done:

  1. No default arguments here - just always pass the device for yourself
  2. Add two methods to the CompiledModel so frame the decision making logic:
    • std::string global_mem_device()
    • std::string funcall_mem_device(idx)

The first only takes device distribution into account.
The second takes the subgraph affinity into account (you can access it).
BOTH should also take the BANK_ALLOC into account if that's set.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

if (std::any_of(shape.begin(), shape.end(), [&](std::size_t dim) {
return dim == 0;
})) {
return ov::Tensor(type, shape);
}
dmatveev marked this conversation as resolved.
Show resolved Hide resolved

ov::SoPtr<ov::ITensor> remote_tensor;
ov::Tensor allocated_tensor;
{
std::lock_guard<std::mutex> guard(m_alloc_mutex);
m_remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
remote_tensor = m_remote_ctx->create_host_tensor(type, shape);
allocated_tensor = ov::make_tensor(remote_tensor);
Comment on lines +932 to +936
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do you need a mutex here? you call allocTensor from multiple threads?

Copy link
Contributor Author

@smirnov-alexey smirnov-alexey Oct 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just in case since we do guard allocation in banks. Less error prone in the future

}
return allocated_tensor;
}

void ov::npuw::JustInferRequest::subscribe_subrequest(std::size_t idx, Completed cb) {
get_real_subrequest(idx)->set_callback(std::move(cb));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,15 @@

#include <limits>
#include <map>
#include <mutex>
#include <optional>
#include <vector>

#include "base_sync_infer_request.hpp"
#include "openvino/runtime/iplugin.hpp"
#include "openvino/runtime/iremote_context.hpp"
#include "openvino/runtime/make_tensor.hpp"
#include "openvino/runtime/tensor.hpp"

namespace ov {
namespace npuw {
Expand All @@ -19,7 +24,7 @@ class AsyncInferRequest;

class JustInferRequest final : public IBaseInferRequest {
public:
explicit JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);
explicit JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model, bool alloc_required);

// Query APIs
std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override;
Expand Down Expand Up @@ -59,6 +64,8 @@ class JustInferRequest final : public IBaseInferRequest {
void connect_subrequests();
void recreate_subrequests(std::size_t idx);

ov::Tensor allocTensor(const ov::element::Type type, const ov::Shape& shape, const std::string& device = "NPU");
dmatveev marked this conversation as resolved.
Show resolved Hide resolved

using LinkFrom = std::pair<std::size_t /* Subrequest index */
,
std::size_t /* Subrequest output index */
Expand Down Expand Up @@ -93,6 +100,10 @@ class JustInferRequest final : public IBaseInferRequest {
map_t global_results; // result idx -> output idx
};
std::vector<GlobalIO> m_subrequests_gio;

bool m_alloc_required;
std::mutex m_alloc_mutex;
std::shared_ptr<ov::IRemoteContext> m_remote_ctx = nullptr;
};

} // namespace npuw
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ std::vector<Avoid> getAvoids(::intel_npu::Config& cfg) {

std::string avoids_opt = cfg.getString<::intel_npu::NPUW_ONLINE_AVOID>();
if (avoids_opt.empty()) {
LOG_WARN(::intel_npu::NPUW_ONLINE_AVOID().key()
<< " property is not set! NPU device will be prioritized for every subgraph.");
LOG_VERB(::intel_npu::NPUW_ONLINE_AVOID().key()
<< " property is not set. NPU device will be prioritized for every subgraph.");
return {};
}

Expand Down
17 changes: 10 additions & 7 deletions src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,13 +95,16 @@ ov::Tensor Bank::unsafe_eval_and_alloc(const LazyTensor& tensor, const std::stri
return transformed_tensor;
}

// FIXME: L0 allocation may crash when run in parallel
std::lock_guard<std::mutex> guard(m_alloc_mutex);

m_remote_ctx = m_core->get_default_context(device_for_alloc)._ptr;
auto remote_tensor =
m_remote_ctx->create_host_tensor(transformed_tensor.get_element_type(), transformed_tensor.get_shape());
auto allocated_tensor = ov::make_tensor(remote_tensor);
ov::SoPtr<ov::ITensor> remote_tensor;
ov::Tensor allocated_tensor;
{
// FIXME: L0 allocation may crash when run in parallel
std::lock_guard<std::mutex> guard(m_alloc_mutex);
m_remote_ctx = m_core->get_default_context(device_for_alloc)._ptr;
remote_tensor =
m_remote_ctx->create_host_tensor(transformed_tensor.get_element_type(), transformed_tensor.get_shape());
allocated_tensor = ov::make_tensor(remote_tensor);
}
transformed_tensor.copy_to(allocated_tensor);
m_device_bank[device_for_alloc][tensor] = allocated_tensor;
return allocated_tensor;
Expand Down
Loading