[NPUW] L0 allocation improvements #27011

Merged 16 commits on Oct 14, 2024
Changes from 2 commits
19 changes: 17 additions & 2 deletions src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp
@@ -7,12 +7,16 @@
#include "compiled_model.hpp"
#include "intel_npu/al/config/npuw.hpp"
#include "logging.hpp"
#include "openvino/runtime/iplugin.hpp"
#include "openvino/runtime/iremote_context.hpp"
#include "util.hpp"

ov::npuw::IBaseInferRequest::IBaseInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
ov::npuw::IBaseInferRequest::IBaseInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model,
bool alloc_required)
: ov::ISyncInferRequest(compiled_model),
m_npuw_model(compiled_model),
m_num_submodels(m_npuw_model->m_compiled_submodels.size()) {
m_num_submodels(m_npuw_model->m_compiled_submodels.size()),
m_alloc_required(alloc_required) {
m_subrequests.resize(m_num_submodels, {});
m_subrequest_devices.resize(m_num_submodels, {});
m_completion_cbs.resize(m_num_submodels, {});
@@ -21,6 +25,17 @@ ov::npuw::IBaseInferRequest::IBaseInferRequest(const std::shared_ptr<ov::npuw::C
}
}

ov::Tensor ov::npuw::IBaseInferRequest::allocTensor(const ov::element::Type type,
const ov::Shape& shape,
const std::string& device) {
if (!m_alloc_required || device == "CPU") {
return ov::Tensor(type, shape);
}
static const auto& m_remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
std::lock_guard<std::mutex> guard(m_alloc_mutex);
return ov::make_tensor(m_remote_ctx->create_host_tensor(type, shape));
}
smirnov-alexey marked this conversation as resolved.
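For context, the remote branch of allocTensor above maps onto the following standalone OpenVINO usage. This is an illustrative sketch only; the device name and shape are placeholders, not values taken from the PR.

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;

    // Level Zero path: a host tensor created through the device's default remote
    // context is backed by memory the NPU driver can import directly, which is
    // what allocTensor() relies on to avoid extra copies at infer time.
    ov::RemoteContext ctx = core.get_default_context("NPU");
    ov::Tensor remote_host = ctx.create_host_tensor(ov::element::f32, ov::Shape{1, 3, 224, 224});

    // Plain host allocation - the fallback allocTensor() uses for the CPU / no-alloc case.
    ov::Tensor plain(ov::element::f32, ov::Shape{1, 3, 224, 224});
    return 0;
}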

ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_requests(std::size_t id,
std::size_t nireq,
bool* recompiled) {
src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp
@@ -7,6 +7,7 @@
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <vector>
@@ -25,7 +26,7 @@ class CompiledModel;
// individual subrequests' execution
class IBaseInferRequest : public ov::ISyncInferRequest {
public:
explicit IBaseInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>&);
explicit IBaseInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>&, bool);

// Execution API - explicitly "finalize" the infer() here
void infer() override; // final - not final yet
@@ -40,6 +41,8 @@ class IBaseInferRequest : public ov::ISyncInferRequest {

void check_tensors() const override;

ov::Tensor allocTensor(const ov::element::Type type, const ov::Shape& shape, const std::string& device = "NPU");

using sptr = std::shared_ptr<IBaseInferRequest>;
using Completed = std::function<void(std::exception_ptr)>;

@@ -109,6 +112,9 @@ class IBaseInferRequest : public ov::ISyncInferRequest {

const std::size_t m_num_submodels;

bool m_alloc_required;
std::mutex m_alloc_mutex;

void dump_input_tensors(std::size_t idx);
void dump_output_tensors(std::size_t idx);

16 changes: 14 additions & 2 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -428,6 +428,18 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
// Finalize memory in closures and weight banks
finalize_weights_bank();

// Set allocator if there is at least 1 NPU submodel
for (std::size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
auto& comp_model_desc = m_compiled_submodels[idx];
if (!comp_model_desc.compiled_model) {
continue;
}
if (*comp_model_desc.device_it == "NPU") {
m_alloc_required = true;
break;
}
}
dmatveev marked this conversation as resolved.

// Print stats report when possible
{
LOG_INFO("Initial device distribution:");
@@ -448,7 +460,7 @@ void ov::npuw::CompiledModel::finalize_weights_bank() {

// Skip optimized out and non-functions
if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
return;
continue;
}

const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
@@ -668,7 +680,7 @@ void ov::npuw::CompiledModel::dump_on_fail(std::size_t id, const std::string& de

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_just_sync_infer_request() {
auto this_sptr = std::static_pointer_cast<ov::npuw::CompiledModel>(shared_from_this());
return std::make_shared<ov::npuw::JustInferRequest>(this_sptr);
return std::make_shared<ov::npuw::JustInferRequest>(this_sptr, m_alloc_required);
}

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_sync_infer_request() const {
2 changes: 2 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -157,6 +157,8 @@ class CompiledModel : public ov::ICompiledModel {

bool m_update_required;

bool m_alloc_required = false;
dmatveev marked this conversation as resolved.

std::function<bool(const ov::SoPtr<ov::ITensor>&, const ov::SoPtr<ov::ITensor>&)> m_acc_check;
std::string m_ref_device;

23 changes: 12 additions & 11 deletions src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
@@ -21,8 +21,9 @@
#include "util.hpp"
#include "weights_bank.hpp"

ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
: IBaseInferRequest(compiled_model) {
ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model,
bool alloc_required)
dmatveev marked this conversation as resolved.
: IBaseInferRequest(compiled_model, alloc_required) {
m_use_function_pipelining = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FUNCALL_ASYNC>();
if (m_use_function_pipelining) {
LOG_WARN("Function call pipelining is enabled for " << m_npuw_model->m_name
@@ -49,7 +50,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
// FIXME: Shouldn't this be handled by the base class? (in create_tensor)
// A special case for function calls
if (comp_model_desc.replaced_by) {
// Pre-allocate output tensors for this function call
// Pre-allocate output tensors for this function call
const auto real_idx = comp_model_desc.replaced_by.value();
auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx];
auto& proto_comp_model = proto_comp_model_desc.compiled_model;
@@ -67,14 +68,14 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
// Note: these buffers are allocated to the entire NWAY (> tail_size)
for (auto&& p : proto_comp_model_desc.spatial->params) {
const auto& iport = proto_comp_model_desc.compiled_model->inputs()[p.idx];
m_spatial_io[real_idx].input_tails[p.idx] =
ov::get_tensor_impl(ov::Tensor(iport.get_element_type(), iport.get_shape()));
m_spatial_io[real_idx].input_tails[p.idx] = ov::get_tensor_impl(
allocTensor(iport.get_element_type(), iport.get_shape(), *proto_comp_model_desc.device_it));
Contributor: just thinking.. if you're using allocTensor only where we store ITensors, why can't allocTensor return the ITensor so you don't need to call get_tensor_impl everywhere?

Contributor Author: Done
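Sketched below is what the suggestion amounts to, assuming the members introduced in this PR (m_alloc_required, m_alloc_mutex, m_npuw_model); the exact signature that landed after the "Done" above may differ.

// allocTensor returning ov::SoPtr<ov::ITensor> directly, so call sites can drop
// the ov::get_tensor_impl() wrapping.
ov::SoPtr<ov::ITensor> ov::npuw::IBaseInferRequest::allocTensor(const ov::element::Type type,
                                                                const ov::Shape& shape,
                                                                const std::string& device) {
    if (!m_alloc_required || device == "CPU") {
        // Plain host allocation, wrapped into an ITensor
        return ov::get_tensor_impl(ov::Tensor(type, shape));
    }
    const auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
    std::lock_guard<std::mutex> guard(m_alloc_mutex);
    return remote_ctx->create_host_tensor(type, shape);
}

// The call site above then simplifies to:
//     m_spatial_io[real_idx].input_tails[p.idx] =
//         allocTensor(iport.get_element_type(), iport.get_shape(), *proto_comp_model_desc.device_it);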

}
const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size();
for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) {
const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx];
m_spatial_io[real_idx].output_tails[out_idx] =
ov::get_tensor_impl(ov::Tensor(oport.get_element_type(), oport.get_shape()));
m_spatial_io[real_idx].output_tails[out_idx] = ov::get_tensor_impl(
allocTensor(oport.get_element_type(), oport.get_shape(), *proto_comp_model_desc.device_it));
}
}
} // if(spatial)
Expand All @@ -88,7 +89,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
shape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range;
}
m_funcall_result[LinkFrom{i, out_idx}] =
ov::get_tensor_impl(ov::Tensor(port.get_element_type(), shape));
ov::get_tensor_impl(allocTensor(port.get_element_type(), shape, *proto_comp_model_desc.device_it));
}
if (real_idx != i) {
// If this function call is NOT the function body, do nothing here - the original
Expand Down Expand Up @@ -153,7 +154,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
LOG_INFO("Preallocating input tensors...");
for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
const auto& port = m_npuw_model->inputs()[i];
m_input_tensors.push_back(ov::get_tensor_impl(ov::Tensor(port.get_element_type(), port.get_shape())));
m_input_tensors.push_back(ov::get_tensor_impl(allocTensor(port.get_element_type(), port.get_shape())));
Contributor: maybe we need an overload for allocTensor which takes ov::Input<ov::Node> / ov::Output<ov::Node>.

Contributor: note - you're not passing any device here. And you have ="NPU" as the default parameter.

Contributor Author: About the default device - that was the idea

Contributor Author: Done
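One possible shape for the overload discussed above, using a hypothetical name (allocOut) and building on the ITensor-returning allocTensor from the earlier sketch; compiled-model ports are ov::Output<const ov::Node>, so that is the type taken here.

ov::SoPtr<ov::ITensor> ov::npuw::IBaseInferRequest::allocOut(const ov::Output<const ov::Node>& port,
                                                             const std::string& device) {
    // The declaration would default `device` to "NPU", mirroring allocTensor.
    return allocTensor(port.get_element_type(), port.get_shape(), device);
}

// The preallocation loop above then reads:
//     m_input_tensors.push_back(allocOut(port));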

m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true};
} // for(inputs)

Expand All @@ -173,7 +174,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
const auto& tensor =
funcall_result_iter != m_funcall_result.end()
? funcall_result_iter->second // Function calls have their tensors allocated, so just use one
: ov::get_tensor_impl(ov::Tensor(port.get_element_type(), port.get_shape()));
: ov::get_tensor_impl(allocTensor(port.get_element_type(), port.get_shape()));

m_output_tensors.push_back(tensor);
m_port_to_tensor[port] = TensorStorage{tensor, true};
Expand Down Expand Up @@ -372,7 +373,7 @@ void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) {
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];
const auto real_idx = comp_model_desc.replaced_by.value_or(idx);

const bool do_copy = needs_copy(idx);
const bool do_copy = !m_alloc_required && needs_copy(idx);
Contributor Author: Not sure if it's correct

Contributor Author: Here we check global parameters. The idea is that global parameters allocation solely depends on m_alloc_required - if it's set, params will be allocated on NPU

Contributor: haven't checked on this yet

Contributor: I think m_alloc_required is overall misleading. It is always required.

Also, you've missed this in my past L0 PR: dmatveev#5

More precisely, this part: https://github.com/dmatveev/openvino/blob/e7d62f1a4412f639d0fb112e4f5647eeff9a1b8e/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp#L117

And then this part: https://github.com/dmatveev/openvino/blob/e7d62f1a4412f639d0fb112e4f5647eeff9a1b8e/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp#L370

The backstory here is that, even if you've allocated your model-global input tensors yourself, they may be overwritten. Even our scripts carelessly do this, unfortunately. So what you need to do is to keep track of the tensors you allocate (maybe you can just memorize the pointers in your new allocTensor method) and check if the tensors you're working with are still "known" to you.

Once there's a set_tensor call - you lose it, and your m_alloc_required flag doesn't tell the truth.

Contributor Author: Good point, thanks
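A minimal sketch of the bookkeeping described above, under the assumptions of the earlier sketches; m_alloc_tensors and is_our_tensor are hypothetical names, not code from this PR.

// Inside allocTensor(), right after the tensor is created (host or remote path):
//     std::lock_guard<std::mutex> guard(m_alloc_mutex);
//     m_alloc_tensors.insert(tensor._ptr.get());  // hypothetical member:
//                                                 // std::unordered_set<const ov::ITensor*> m_alloc_tensors;

bool ov::npuw::IBaseInferRequest::is_our_tensor(const ov::SoPtr<ov::ITensor>& tensor) const {
    // A set_tensor() call binds a different tensor to the port, so the lookup fails
    // and the caller falls back to the copying path.
    return m_alloc_tensors.count(tensor._ptr.get()) > 0;
}

// bind_global_parameters() can then decide per tensor instead of per request:
//     const bool do_copy = needs_copy(idx) && !is_our_tensor(get_tensor(port));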

const auto& iodesc = m_subrequests_gio.at(idx);

const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx];
src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp
@@ -19,7 +19,7 @@ class AsyncInferRequest;

class JustInferRequest final : public IBaseInferRequest {
public:
explicit JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);
explicit JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model, bool alloc_required);

// Query APIs
std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override;
@@ -74,8 +74,8 @@ std::vector<Avoid> getAvoids(::intel_npu::Config& cfg) {

std::string avoids_opt = cfg.getString<::intel_npu::NPUW_ONLINE_AVOID>();
if (avoids_opt.empty()) {
LOG_WARN(::intel_npu::NPUW_ONLINE_AVOID().key()
<< " property is not set! NPU device will be prioritized for every subgraph.");
LOG_VERB(::intel_npu::NPUW_ONLINE_AVOID().key()
<< " property is not set. NPU device will be prioritized for every subgraph.");
return {};
}

12 changes: 7 additions & 5 deletions src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
@@ -95,12 +95,14 @@ ov::Tensor Bank::unsafe_eval_and_alloc(const LazyTensor& tensor, const std::stri
return transformed_tensor;
}

// FIXME: L0 allocation may crash when run in parallel
std::lock_guard<std::mutex> guard(m_alloc_mutex);

m_remote_ctx = m_core->get_default_context(device_for_alloc)._ptr;
auto remote_tensor =
m_remote_ctx->create_host_tensor(transformed_tensor.get_element_type(), transformed_tensor.get_shape());
ov::SoPtr<ov::ITensor> remote_tensor;
{
// FIXME: L0 allocation may crash when run in parallel
std::lock_guard<std::mutex> guard(m_alloc_mutex);
remote_tensor =
m_remote_ctx->create_host_tensor(transformed_tensor.get_element_type(), transformed_tensor.get_shape());
}
auto allocated_tensor = ov::make_tensor(remote_tensor);
transformed_tensor.copy_to(allocated_tensor);
m_device_bank[device_for_alloc][tensor] = allocated_tensor;