Skip to content

Commit

Permalink
[ARM CPU] New version of scheduler enabling (openvinotoolkit#20488)
Browse files Browse the repository at this point in the history
### Tickets:
 - CVS-132497
 - CVS-129036
 - CVS-134932
  • Loading branch information
allnes authored Mar 27, 2024
1 parent e90b7ea commit dc2416c
Show file tree
Hide file tree
Showing 15 changed files with 53 additions and 96 deletions.
11 changes: 11 additions & 0 deletions src/plugins/intel_cpu/src/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@
#include <cstring>
#include <utility>

#if defined(OV_CPU_WITH_ACL)
#include "nodes/executors/acl/acl_ie_scheduler.hpp"
#endif

using namespace ov::threading;

namespace ov {
Expand Down Expand Up @@ -82,6 +86,13 @@ CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
do {
for (auto&& task : tasks) {
task = [this] {
#if defined(OV_CPU_WITH_ACL)
static std::once_flag flag_once;
std::call_once(flag_once, [&]() {
std::shared_ptr<arm_compute::IScheduler> acl_scheduler = std::make_shared<ACLScheduler>();
arm_compute::Scheduler::set(std::static_pointer_cast<arm_compute::IScheduler>(acl_scheduler));
});
#endif
CompiledModel::get_graph();
};
}
Expand Down
4 changes: 2 additions & 2 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,10 @@ bool ACLConvertExecutor::init(const ConvertParams& convertParams,

if (isCopyOp) {
acl_copy = std::make_unique<NECopy>();
acl_copy->configure(&srcTensor, &dstTensor);
configureThreadSafe([&] { acl_copy->configure(&srcTensor, &dstTensor); });
} else {
acl_cast = std::make_unique<NECast>();
acl_cast->configure(&srcTensor, &dstTensor, ConvertPolicy::SATURATE);
configureThreadSafe([&] { acl_cast->configure(&srcTensor, &dstTensor, ConvertPolicy::SATURATE); });
}
return true;
}
Expand Down
5 changes: 3 additions & 2 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,9 @@ bool AclDeconvExecutor::init(const DeconvAttrs& deconvAttrs,
biasTensor.allocator()->init(biasTensorInfo);

deconv = std::make_unique<arm_compute::NEDeconvolutionLayer>();
deconv->configure(&srcTensor, &weiTensor, deconvAttrs.withBiasesParam ? &biasTensor : nullptr, &dstTensor, deconv_info);

configureThreadSafe([&] {
deconv->configure(&srcTensor, &weiTensor, deconvAttrs.withBiasesParam ? &biasTensor : nullptr, &dstTensor, deconv_info);
});
// weights tensor shape is changed because ACL expects [O, I, H, W] tensor while OV uses [I, O, H, W] tensor
weiBuffer = std::vector<float>(srcDescs[1]->getShape().getStaticDims()[0] *
srcDescs[1]->getShape().getStaticDims()[1] *
Expand Down
11 changes: 1 addition & 10 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,6 @@ namespace intel_cpu {

using namespace arm_compute;

// Hands out one mutex shared by every caller in this translation unit.
// The mutex is a function-local static, so it is constructed on the first call
// and the very same object is returned on every subsequent call.
static std::mutex& get_mtx_ifunc() {
    static std::mutex ifunc_guard;
    return ifunc_guard;
}

inline VectorDims reshape_sizes(VectorDims dims) {
const size_t MAX_NUM_SHAPE = arm_compute::MAX_DIMS;
VectorDims result_dims(MAX_NUM_SHAPE - 1);
Expand Down Expand Up @@ -524,11 +519,7 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
static_cast<int>(aclEltwiseAttrs.algorithm));
}

// Eltwise operations hit problems (segmentation faults, data races, etc.) when several configure(...) functions run in parallel.
// We filed an issue about this problem here: https://github.com/ARM-software/ComputeLibrary/issues/1073
// TODO: revisit this once the issue above has been answered
std::lock_guard<std::mutex> _lock {get_mtx_ifunc()};
ifunc = exec_func();
configureThreadSafe([&] { ifunc = exec_func(); });
return true;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,6 @@ void ACLScheduler::schedule_custom(ICPPKernel *kernel, const Hints &hints, const
const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
const auto _num_threads = std::min(num_iterations, static_cast<unsigned int>(parallel_get_num_threads()));

if (num_iterations < 1) {
return;
}

std::function<void(const Window &window, const ThreadInfo &info)> main_run;
if (tensors.empty()) {
main_run = [&](const Window &window, const ThreadInfo &info) {
Expand Down
17 changes: 10 additions & 7 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,16 @@ bool ov::intel_cpu::ACLInterpolateExecutor::init(const InterpolateAttrs &interpo
dstTensor.allocator()->init(dstTensorInfo);

acl_scale = std::make_unique<arm_compute::NEScale>();
acl_scale->configure(&srcTensor, &dstTensor, arm_compute::ScaleKernelInfo(acl_policy,
arm_compute::BorderMode::REPLICATE,
arm_compute::PixelValue(),
acl_coord,
false,
aclInterpolateAttrs.coordTransMode == InterpolateCoordTransMode::align_corners,
getAclDataLayoutByMemoryDesc(srcDescs[0])));
configureThreadSafe([&] {
acl_scale->configure(&srcTensor, &dstTensor, arm_compute::ScaleKernelInfo(acl_policy,
arm_compute::BorderMode::REPLICATE,
arm_compute::PixelValue(),
acl_coord,
false,
aclInterpolateAttrs.coordTransMode ==
InterpolateCoordTransMode::align_corners,
getAclDataLayoutByMemoryDesc(srcDescs[0])));
});
return true;
}

Expand Down
2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ bool AclMVNExecutor::init(const MVNAttrs& mvnAttrs,
dstTensor.allocator()->init(dstTensorInfo);

mvn = std::make_unique<arm_compute::NEMeanStdDevNormalizationLayer>();
mvn->configure(&srcTensor, &dstTensor, mvnAttrs.epsValue_);
configureThreadSafe([&] { mvn->configure(&srcTensor, &dstTensor, mvnAttrs.epsValue_); });

return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ bool AclPoolingExecutor::init(const PoolingAttrs& poolingAttrs,
};
}
}
ifunc = exec_func();
configureThreadSafe([&] { ifunc = exec_func(); });
return true;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ bool AclReduceExecutor::init(const ReduceAttrs& reduceAttrs,
default:
OPENVINO_THROW("Unsupported operation type for ACL Reduce executor: ", static_cast<int>(reduceAttrs.operation));
}
ifunc = exec_func();
configureThreadSafe([&] { ifunc = exec_func(); });
return true;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ bool ov::intel_cpu::ACLTransposeExecutor::init(const ov::intel_cpu::TransposePar
dstTensor.allocator()->init(dstTensorInfo);

acl_permute = std::make_unique<arm_compute::NEPermute>();
acl_permute->configure(&srcTensor, &dstTensor, order);
configureThreadSafe([&] { acl_permute->configure(&srcTensor, &dstTensor, order); });
return true;
}

Expand Down
13 changes: 13 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_utils.hpp"
#include "support/Mutex.h"

void ov::intel_cpu::configureThreadSafe(const std::function<void(void)>& config) {
// Issue: CVS-123514
static arm_compute::Mutex mtx_config;
arm_compute::lock_guard<arm_compute::Mutex> _lock{mtx_config};
config();
}
10 changes: 9 additions & 1 deletion src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

#include "memory_desc/cpu_memory_desc.h"
#include "arm_compute/core/Types.h"
// #include "openvino/core/type/element_type.hpp"

namespace ov {
namespace intel_cpu {

Expand Down Expand Up @@ -109,5 +109,13 @@ inline arm_compute::DataLayout getAclDataLayoutByMemoryDesc(MemoryDescCPtr desc)
return arm_compute::DataLayout::UNKNOWN;
}

/**
 * @brief Runs a ComputeLibrary configuration function in a thread-safe way.
 * Arm Compute Library 23.08 does not officially support thread-safe configure() calls.
 * For example, calling configure() for Eltwise operations from multiple streams leads to a data race and a segmentation fault.
 * Wrap the configure() call in a callable and pass it here instead of invoking it directly.
 * @param config ComputeLibrary configuration function; it is invoked once, synchronously, before this call returns
 */
void configureThreadSafe(const std::function<void(void)>& config);

} // namespace intel_cpu
} // namespace ov
39 changes: 0 additions & 39 deletions src/plugins/intel_cpu/src/plugin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,6 @@

#include "cpu/x64/cpu_isa_traits.hpp"

#if defined(OV_CPU_WITH_ACL)
# include "arm_compute/runtime/CPP/CPPScheduler.h"
# include "nodes/executors/acl/acl_ie_scheduler.hpp"
#endif

using namespace ov::threading;

namespace ov {
Expand Down Expand Up @@ -127,46 +122,12 @@ class CPUSpecialSetup {
};
#endif // __linux__

#if defined(OV_CPU_WITH_ACL)
std::mutex Plugin::SchedulerGuard::mutex;
std::weak_ptr<Plugin::SchedulerGuard> Plugin::SchedulerGuard::ptr;

// Selects the scheduler used by Arm Compute Library at guard construction time.
// The choice is made at compile time from OV_THREAD:
//  - OV_THREAD_SEQ build: the ST scheduler type is selected;
//  - any other threading build: a newly created ACLScheduler is installed.
Plugin::SchedulerGuard::SchedulerGuard() {
# if OV_THREAD == OV_THREAD_SEQ
// To save state for ACL cores in single-thread mode
arm_compute::Scheduler::set(arm_compute::Scheduler::Type::ST);
# else
// Hand ACL a custom scheduler object instead of one of its predefined types
arm_compute::Scheduler::set(std::make_shared<ACLScheduler>());
# endif
}

std::shared_ptr<Plugin::SchedulerGuard> Plugin::SchedulerGuard::instance() {
std::lock_guard<std::mutex> lock{SchedulerGuard::mutex};
auto scheduler_guard_ptr = SchedulerGuard::ptr.lock();
if (scheduler_guard_ptr == nullptr) {
SchedulerGuard::ptr = scheduler_guard_ptr = std::make_shared<SchedulerGuard>();
}
return scheduler_guard_ptr;
}

// On destruction, switches ACL to the ST scheduler type unless the CUSTOM
// scheduler type is reported as available.
// NOTE(review): dest_mutex is taken here, but it appears to be a per-instance
// member, so it does not order destructors of different guards - confirm intent.
Plugin::SchedulerGuard::~SchedulerGuard() {
    // To save the state of scheduler after ACLScheduler has been executed
    // TODO: find out the cause of the state
    std::lock_guard<std::mutex> guard{this->dest_mutex};

    const bool custom_available = arm_compute::Scheduler::is_available(arm_compute::Scheduler::Type::CUSTOM);
    if (custom_available) {
        return;
    }
    arm_compute::Scheduler::set(arm_compute::Scheduler::Type::ST);
}
#endif

Plugin::Plugin() : deviceFullName(getDeviceFullName()), specialSetup(new CPUSpecialSetup) {
set_device_name("CPU");
// Initialize Xbyak::util::Cpu object on Pcore for hybrid cores machine
get_executor_manager()->execute_task_by_streams_executor(IStreamsExecutor::Config::PreferredCoreType::BIG, [] {
dnnl::impl::cpu::x64::cpu();
});
#if defined(OV_CPU_WITH_ACL)
scheduler_guard = SchedulerGuard::instance();
#endif
auto& ov_version = ov::get_openvino_version();
m_compiled_model_runtime_properties["OV_VERSION"] = std::string(ov_version.buildNumber);
}
Expand Down
14 changes: 0 additions & 14 deletions src/plugins/intel_cpu/src/plugin.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,20 +57,6 @@ class Plugin : public ov::IPlugin {
ov::AnyMap m_compiled_model_runtime_properties;

std::shared_ptr<void> specialSetup;

#if defined(OV_CPU_WITH_ACL)
// Guard object tied to the Arm Compute Library scheduler selection.
// One instance is shared between holders: instance() hands out the guard that is
// still alive (tracked through `ptr`) or creates a new one.
struct SchedulerGuard {
// Selects the ACL scheduler (see the definition for the OV_THREAD-dependent choice)
SchedulerGuard();
// May switch ACL to the ST scheduler type when the guard goes away
~SchedulerGuard();
// Returns the shared guard, creating it when no live instance exists
static std::shared_ptr<SchedulerGuard> instance();
// Serializes instance() callers
static std::mutex mutex;
// separate mutex for saving ACLScheduler state in destructor
mutable std::mutex dest_mutex;
// Non-owning reference to the guard that is currently alive, if any
static std::weak_ptr<SchedulerGuard> ptr;
};

std::shared_ptr<SchedulerGuard> scheduler_guard;
#endif
};

} // namespace intel_cpu
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -300,19 +300,6 @@ std::vector<std::string> disabledTestPatterns() {
{
retVector.emplace_back(
R"(smoke_CompareWithRefs_static_check_collapsing/EltwiseLayerTest.Inference/IS.*_eltwise_op_type=Div_secondary_input_type=PARAMETER_opType=VECTOR_model_type=i32_InType=undefined_OutType=undefined_trgDev=CPU.*)");
// TODO: enable once streams / tput mode is supported
retVector.emplace_back(
R"(OVClassConfigTestCPU.smoke_CpuExecNetworkCheck(Model|Core)StreamsHasHigherPriorityThanLatencyHint.*)");
retVector.emplace_back(
R"(smoke_BehaviorTests/CorrectConfigCheck.canSetConfigAndCheckGetConfig.*CPU_THROUGHPUT_STREAMS=8.*)");
retVector.emplace_back(
R"(smoke_BehaviorTests/CorrectConfigCheck.canSetConfigTwiceAndCheckGetConfig.*CPU_THROUGHPUT_STREAMS=8.*)");
retVector.emplace_back(
R"(smoke_CPU_OVClassLoadNetworkAndCheckWithSecondaryPropertiesTest/OVClassLoadNetworkAndCheckSecondaryPropertiesTest.LoadNetworkAndCheckSecondaryPropertiesTest.*)");
retVector.emplace_back(
R"(smoke_CPU_OVClassLoadNetworkAndCheckWithSecondaryPropertiesDoubleTest/OVClassLoadNetworkAndCheckSecondaryPropertiesTest.LoadNetworkAndCheckSecondaryPropertiesTest.*)");
retVector.emplace_back(R"(smoke_CPU_OVClassCompileModelAndCheckSecondaryPropertiesTest.*)");
retVector.emplace_back(R"(smoke_CPU_OVClassCompileModelAndCheckWithSecondaryPropertiesDoubleTest.*)");
// Issue: 123321
retVector.emplace_back(
R"(.*smoke_RNNSequenceCommonZeroClip/RNNSequenceTest.Inference.*hidden_size=1.*relu.*direction=reverse.*)");
Expand Down

0 comments on commit dc2416c

Please sign in to comment.