[ARM CPU] New version of scheduler enabling (openvinotoolkit#20488)

### Tickets:
- CVS-132497
- CVS-129036
- CVS-134932
syaifulnizamiphone7 · Mar 27, 2024 · dc2416c · dc2416c
1 parent e90b7ea
commit dc2416c
Show file tree

Hide file tree

Showing 15 changed files with 53 additions and 96 deletions.
diff --git a/src/plugins/intel_cpu/src/compiled_model.cpp b/src/plugins/intel_cpu/src/compiled_model.cpp
@@ -23,6 +23,10 @@
 #include <cstring>
 #include <utility>
 
+#if defined(OV_CPU_WITH_ACL)
+#include "nodes/executors/acl/acl_ie_scheduler.hpp"
+#endif
+
 using namespace ov::threading;
 
 namespace ov {
@@ -82,6 +86,13 @@ CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
         do {
             for (auto&& task : tasks) {
                 task = [this] {
+#if defined(OV_CPU_WITH_ACL)
+                    static std::once_flag flag_once;
+                    std::call_once(flag_once, [&]() {
+                        std::shared_ptr<arm_compute::IScheduler> acl_scheduler = std::make_shared<ACLScheduler>();
+                        arm_compute::Scheduler::set(std::static_pointer_cast<arm_compute::IScheduler>(acl_scheduler));
+                    });
+#endif
                     CompiledModel::get_graph();
                 };
             }

diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_convert.cpp
@@ -52,10 +52,10 @@ bool ACLConvertExecutor::init(const ConvertParams& convertParams,
 
     if (isCopyOp) {
         acl_copy = std::make_unique<NECopy>();
-        acl_copy->configure(&srcTensor, &dstTensor);
+        configureThreadSafe([&] { acl_copy->configure(&srcTensor, &dstTensor); });
     } else {
         acl_cast = std::make_unique<NECast>();
-        acl_cast->configure(&srcTensor, &dstTensor, ConvertPolicy::SATURATE);
+        configureThreadSafe([&] { acl_cast->configure(&srcTensor, &dstTensor, ConvertPolicy::SATURATE); });
     }
     return true;
 }

diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp
@@ -80,8 +80,9 @@ bool AclDeconvExecutor::init(const DeconvAttrs& deconvAttrs,
         biasTensor.allocator()->init(biasTensorInfo);
 
     deconv = std::make_unique<arm_compute::NEDeconvolutionLayer>();
-    deconv->configure(&srcTensor, &weiTensor, deconvAttrs.withBiasesParam ? &biasTensor : nullptr, &dstTensor, deconv_info);
-
+    configureThreadSafe([&] {
+        deconv->configure(&srcTensor, &weiTensor, deconvAttrs.withBiasesParam ? &biasTensor : nullptr, &dstTensor, deconv_info);
+    });
     // weights tensor shape is changed because ACL expects [O, I, H, W] tensor while OV uses [I, O, H, W] tensor
      weiBuffer = std::vector<float>(srcDescs[1]->getShape().getStaticDims()[0] *
                                     srcDescs[1]->getShape().getStaticDims()[1] *

diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
@@ -11,11 +11,6 @@ namespace intel_cpu {
 
 using namespace arm_compute;
 
-static std::mutex & get_mtx_ifunc() {
-    static std::mutex mtx_ifunc;
-    return mtx_ifunc;
-}
-
 inline VectorDims reshape_sizes(VectorDims dims) {
     const size_t MAX_NUM_SHAPE = arm_compute::MAX_DIMS;
     VectorDims result_dims(MAX_NUM_SHAPE - 1);
@@ -524,11 +519,7 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
                            static_cast<int>(aclEltwiseAttrs.algorithm));
     }
 
-    // We get a problem (seg. faults, data race etc) for eltwise operations when we use several configure(...) functions in parallel.
-    // We created issue about this problem here: https://github.com/ARM-software/ComputeLibrary/issues/1073
-    // TODO: change it when we will get an answer to our question in issue
-    std::lock_guard<std::mutex> _lock {get_mtx_ifunc()};
-    ifunc = exec_func();
+    configureThreadSafe([&] { ifunc = exec_func(); });
     return true;
 }
 

diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_ie_scheduler.cpp
@@ -27,10 +27,6 @@ void ACLScheduler::schedule_custom(ICPPKernel *kernel, const Hints &hints, const
     const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
     const auto _num_threads = std::min(num_iterations, static_cast<unsigned int>(parallel_get_num_threads()));
 
-    if (num_iterations < 1) {
-        return;
-    }
-
     std::function<void(const Window &window, const ThreadInfo &info)> main_run;
     if (tensors.empty()) {
         main_run = [&](const Window &window, const ThreadInfo &info) {

diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_interpolate.cpp
@@ -76,13 +76,16 @@ bool ov::intel_cpu::ACLInterpolateExecutor::init(const InterpolateAttrs &interpo
     dstTensor.allocator()->init(dstTensorInfo);
 
     acl_scale = std::make_unique<arm_compute::NEScale>();
-    acl_scale->configure(&srcTensor, &dstTensor, arm_compute::ScaleKernelInfo(acl_policy,
-                                                                              arm_compute::BorderMode::REPLICATE,
-                                                                              arm_compute::PixelValue(),
-                                                                              acl_coord,
-                                                                              false,
-                                                                              aclInterpolateAttrs.coordTransMode == InterpolateCoordTransMode::align_corners,
-                                                                              getAclDataLayoutByMemoryDesc(srcDescs[0])));
+    configureThreadSafe([&] {
+        acl_scale->configure(&srcTensor, &dstTensor, arm_compute::ScaleKernelInfo(acl_policy,
+                                                                                  arm_compute::BorderMode::REPLICATE,
+                                                                                  arm_compute::PixelValue(),
+                                                                                  acl_coord,
+                                                                                  false,
+                                                                                  aclInterpolateAttrs.coordTransMode ==
+                                                                                  InterpolateCoordTransMode::align_corners,
+                                                                                  getAclDataLayoutByMemoryDesc(srcDescs[0])));
+    });
     return true;
 }
 

diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.cpp
@@ -57,7 +57,7 @@ bool AclMVNExecutor::init(const MVNAttrs& mvnAttrs,
     dstTensor.allocator()->init(dstTensorInfo);
 
     mvn = std::make_unique<arm_compute::NEMeanStdDevNormalizationLayer>();
-    mvn->configure(&srcTensor, &dstTensor, mvnAttrs.epsValue_);
+    configureThreadSafe([&] { mvn->configure(&srcTensor, &dstTensor, mvnAttrs.epsValue_); });
 
     return true;
 }

diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.cpp
@@ -186,7 +186,7 @@ bool AclPoolingExecutor::init(const PoolingAttrs& poolingAttrs,
             };
         }
     }
-    ifunc = exec_func();
+    configureThreadSafe([&] { ifunc = exec_func(); });
     return true;
 }
 

diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_reduce.cpp
@@ -88,7 +88,7 @@ bool AclReduceExecutor::init(const ReduceAttrs& reduceAttrs,
         default:
             OPENVINO_THROW("Unsupported operation type for ACL Reduce executor: ", static_cast<int>(reduceAttrs.operation));
     }
-    ifunc = exec_func();
+    configureThreadSafe([&] { ifunc = exec_func(); });
     return true;
 }
 

diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_transpose.cpp
@@ -51,7 +51,7 @@ bool ov::intel_cpu::ACLTransposeExecutor::init(const ov::intel_cpu::TransposePar
     dstTensor.allocator()->init(dstTensorInfo);
 
     acl_permute = std::make_unique<arm_compute::NEPermute>();
-    acl_permute->configure(&srcTensor, &dstTensor, order);
+    configureThreadSafe([&] { acl_permute->configure(&srcTensor, &dstTensor, order); });
     return true;
 }
 

diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.cpp
@@ -0,0 +1,13 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "acl_utils.hpp"
+#include "support/Mutex.h"
+
+void ov::intel_cpu::configureThreadSafe(const std::function<void(void)>& config) {
+    /* Issue: CVS-123514
+     * Invokes `config` while a guard on one function-local static mutex is alive, so
+     * every callback routed through this function shares the same lock object. The
+     * guard lives until the end of this function, i.e. across the whole config() call.
+     * NOTE(review): arm_compute::Mutex / arm_compute::lock_guard come from ACL's
+     * support/Mutex.h; presumably they map to the std:: equivalents in multi-threaded
+     * ACL builds - confirm, since a no-op variant would leave this wrapper unprotected.
+     */
+    static arm_compute::Mutex mtx_config;
+    arm_compute::lock_guard<arm_compute::Mutex> _lock{mtx_config};
+    config();
+}
diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp
@@ -5,7 +5,7 @@
 
 #include "memory_desc/cpu_memory_desc.h"
 #include "arm_compute/core/Types.h"
-// #include "openvino/core/type/element_type.hpp"
+
 namespace ov {
 namespace intel_cpu {
 
@@ -109,5 +109,13 @@ inline arm_compute::DataLayout getAclDataLayoutByMemoryDesc(MemoryDescCPtr desc)
     return arm_compute::DataLayout::UNKNOWN;
 }
 
+/**
+* @brief Runs a ComputeLibrary configuration function in a thread-safe manner.
+* Arm Compute Library 23.08 does not officially support thread-safe configure() calls.
+* For example, calling configure() for Eltwise operations from multiple streams leads to a data race and a seg fault.
+* @param config callable that performs the ComputeLibrary configuration; it is invoked synchronously, before this function returns
+*/
+void configureThreadSafe(const std::function<void(void)>& config);
+
 }   // namespace intel_cpu
 }   // namespace ov
diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp
@@ -25,11 +25,6 @@
 
 #include "cpu/x64/cpu_isa_traits.hpp"
 
-#if defined(OV_CPU_WITH_ACL)
-#    include "arm_compute/runtime/CPP/CPPScheduler.h"
-#    include "nodes/executors/acl/acl_ie_scheduler.hpp"
-#endif
-
 using namespace ov::threading;
 
 namespace ov {
@@ -127,46 +122,12 @@ class CPUSpecialSetup {
 };
 #endif  // __linux__
 
-#if defined(OV_CPU_WITH_ACL)
-std::mutex Plugin::SchedulerGuard::mutex;
-std::weak_ptr<Plugin::SchedulerGuard> Plugin::SchedulerGuard::ptr;
-
-Plugin::SchedulerGuard::SchedulerGuard() {
-#    if OV_THREAD == OV_THREAD_SEQ
-    // To save state for ACL cores in single-thread mode
-    arm_compute::Scheduler::set(arm_compute::Scheduler::Type::ST);
-#    else
-    arm_compute::Scheduler::set(std::make_shared<ACLScheduler>());
-#    endif
-}
-
-std::shared_ptr<Plugin::SchedulerGuard> Plugin::SchedulerGuard::instance() {
-    std::lock_guard<std::mutex> lock{SchedulerGuard::mutex};
-    auto scheduler_guard_ptr = SchedulerGuard::ptr.lock();
-    if (scheduler_guard_ptr == nullptr) {
-        SchedulerGuard::ptr = scheduler_guard_ptr = std::make_shared<SchedulerGuard>();
-    }
-    return scheduler_guard_ptr;
-}
-
-Plugin::SchedulerGuard::~SchedulerGuard() {
-    // To save the state of scheduler after ACLScheduler has been executed
-    // TODO: find out the cause of the state
-    std::lock_guard<std::mutex> lock{this->dest_mutex};
-    if (!arm_compute::Scheduler::is_available(arm_compute::Scheduler::Type::CUSTOM))
-        arm_compute::Scheduler::set(arm_compute::Scheduler::Type::ST);
-}
-#endif
-
 Plugin::Plugin() : deviceFullName(getDeviceFullName()), specialSetup(new CPUSpecialSetup) {
     set_device_name("CPU");
     // Initialize Xbyak::util::Cpu object on Pcore for hybrid cores machine
     get_executor_manager()->execute_task_by_streams_executor(IStreamsExecutor::Config::PreferredCoreType::BIG, [] {
         dnnl::impl::cpu::x64::cpu();
     });
-#if defined(OV_CPU_WITH_ACL)
-    scheduler_guard = SchedulerGuard::instance();
-#endif
     auto& ov_version = ov::get_openvino_version();
     m_compiled_model_runtime_properties["OV_VERSION"] = std::string(ov_version.buildNumber);
 }

diff --git a/src/plugins/intel_cpu/src/plugin.h b/src/plugins/intel_cpu/src/plugin.h
@@ -57,20 +57,6 @@ class Plugin : public ov::IPlugin {
     ov::AnyMap m_compiled_model_runtime_properties;
 
     std::shared_ptr<void> specialSetup;
-
-#if defined(OV_CPU_WITH_ACL)
-    struct SchedulerGuard {
-        SchedulerGuard();
-        ~SchedulerGuard();
-        static std::shared_ptr<SchedulerGuard> instance();
-        static std::mutex mutex;
-        // separate mutex for saving ACLScheduler state in destructor
-        mutable std::mutex dest_mutex;
-        static std::weak_ptr<SchedulerGuard> ptr;
-    };
-
-    std::shared_ptr<SchedulerGuard> scheduler_guard;
-#endif
 };
 
 }  // namespace intel_cpu

diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp
@@ -300,19 +300,6 @@ std::vector<std::string> disabledTestPatterns() {
     {
         retVector.emplace_back(
             R"(smoke_CompareWithRefs_static_check_collapsing/EltwiseLayerTest.Inference/IS.*_eltwise_op_type=Div_secondary_input_type=PARAMETER_opType=VECTOR_model_type=i32_InType=undefined_OutType=undefined_trgDev=CPU.*)");
-        // TODO: enable once streams / tput mode is supported
-        retVector.emplace_back(
-            R"(OVClassConfigTestCPU.smoke_CpuExecNetworkCheck(Model|Core)StreamsHasHigherPriorityThanLatencyHint.*)");
-        retVector.emplace_back(
-            R"(smoke_BehaviorTests/CorrectConfigCheck.canSetConfigAndCheckGetConfig.*CPU_THROUGHPUT_STREAMS=8.*)");
-        retVector.emplace_back(
-            R"(smoke_BehaviorTests/CorrectConfigCheck.canSetConfigTwiceAndCheckGetConfig.*CPU_THROUGHPUT_STREAMS=8.*)");
-        retVector.emplace_back(
-            R"(smoke_CPU_OVClassLoadNetworkAndCheckWithSecondaryPropertiesTest/OVClassLoadNetworkAndCheckSecondaryPropertiesTest.LoadNetworkAndCheckSecondaryPropertiesTest.*)");
-        retVector.emplace_back(
-            R"(smoke_CPU_OVClassLoadNetworkAndCheckWithSecondaryPropertiesDoubleTest/OVClassLoadNetworkAndCheckSecondaryPropertiesTest.LoadNetworkAndCheckSecondaryPropertiesTest.*)");
-        retVector.emplace_back(R"(smoke_CPU_OVClassCompileModelAndCheckSecondaryPropertiesTest.*)");
-        retVector.emplace_back(R"(smoke_CPU_OVClassCompileModelAndCheckWithSecondaryPropertiesDoubleTest.*)");
         // Issue: 123321
         retVector.emplace_back(
             R"(.*smoke_RNNSequenceCommonZeroClip/RNNSequenceTest.Inference.*hidden_size=1.*relu.*direction=reverse.*)");
-Original file line number
+Diff line change
@@ Expand Up @@
                 };
             }
         }
-        ifunc = exec_func();
+        configureThreadSafe([&] { ifunc = exec_func(); });
         return true;
     }
@@ Expand Down @@