Skip to content

Add cpu_thread setting logic to xnn_executor_runner #8902

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 30 commits into the base branch on Mar 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 23 additions & 13 deletions backends/xnnpack/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,14 @@ if(NOT PYTHON_EXECUTABLE)
resolve_python_executable()
endif()

# NB: Enabling this will serialize execution of delegate instances
# Keeping this OFF by default to maintain existing behavior, to be revisited.
# NB: Enabling this will serialize execution of delegate instances Keeping this
# OFF by default to maintain existing behavior, to be revisited.
option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE
"Enable workspace sharing across different delegate instances" ON)
# Keeping this OFF by default due to regressions in decode
# and model load with kleidi kernels
option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI
"Enable Arm Kleidi kernels" OFF)
"Enable workspace sharing across different delegate instances" ON
)
# Keeping this OFF by default due to regressions in decode and model load with
# kleidi kernels
option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI "Enable Arm Kleidi kernels" OFF)
if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE)
add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE)
endif()
Expand Down Expand Up @@ -100,8 +100,7 @@ include(cmake/Dependencies.cmake)
list(TRANSFORM _xnnpack_backend__srcs PREPEND "${EXECUTORCH_ROOT}/")
add_library(xnnpack_backend STATIC ${_xnnpack_backend__srcs})
target_link_libraries(
xnnpack_backend PRIVATE ${xnnpack_third_party} executorch_core
xnnpack_schema
xnnpack_backend PRIVATE ${xnnpack_third_party} executorch_core xnnpack_schema
)

target_include_directories(
Expand All @@ -119,6 +118,12 @@ target_include_directories(
target_compile_options(xnnpack_backend PUBLIC ${_common_compile_options})
target_link_options_shared_lib(xnnpack_backend)

if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
list(APPEND xnn_executor_runner_libs optimized_native_cpu_ops_lib)
else()
list(APPEND xnn_executor_runner_libs portable_ops_lib)
endif()

list(APPEND xnn_executor_runner_libs xnnpack_backend executorch)

# ios can only build library but not binary
Expand All @@ -134,14 +139,19 @@ if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
if(EXECUTORCH_BUILD_DEVTOOLS)
list(APPEND xnn_executor_runner_libs etdump)
else()
message(SEND_ERROR "Use of 'EXECUTORCH_ENABLE_EVENT_TRACER' requires 'EXECUTORCH_BUILD_DEVTOOLS' to be enabled.")
message(
SEND_ERROR
"Use of 'EXECUTORCH_ENABLE_EVENT_TRACER' requires 'EXECUTORCH_BUILD_DEVTOOLS' to be enabled."
)
endif()
endif()

target_link_libraries(
xnn_executor_runner gflags portable_ops_lib ${xnn_executor_runner_libs}
)
target_link_libraries(xnn_executor_runner gflags ${xnn_executor_runner_libs})
target_compile_options(xnn_executor_runner PUBLIC ${_common_compile_options})
if(EXECUTORCH_BUILD_PTHREADPOOL)
target_link_libraries(xnn_executor_runner extension_threadpool pthreadpool)
target_compile_definitions(xnn_executor_runner PRIVATE ET_USE_THREADPOOL)
endif()
endif()

install(
Expand Down
21 changes: 21 additions & 0 deletions examples/portable/executor_runner/executor_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@
#include <executorch/devtools/etdump/etdump_flatcc.h>
#endif // ET_EVENT_TRACER_ENABLED

#if defined(ET_USE_THREADPOOL)
#include <executorch/extension/threadpool/cpuinfo_utils.h>
#include <executorch/extension/threadpool/threadpool.h>
#endif

static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4 MB

static uint8_t temp_allocator_pool[1024U * 1024U];
Expand All @@ -47,6 +52,10 @@ DEFINE_uint32(num_executions, 1, "Number of times to run the model.");
#ifdef ET_EVENT_TRACER_ENABLED
DEFINE_string(etdump_path, "model.etdump", "Write ETDump data to this path.");
#endif // ET_EVENT_TRACER_ENABLED
DEFINE_int32(
cpu_threads,
-1,
"Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");

using executorch::extension::FileDataLoader;
using executorch::runtime::Error;
Expand Down Expand Up @@ -124,6 +133,18 @@ int main(int argc, char** argv) {
return 1;
}

auto cpu_threads = FLAGS_cpu_threads;
#if defined(ET_USE_THREADPOOL)
uint32_t num_performant_cores = cpu_threads == -1
? ::executorch::extension::cpuinfo::get_num_performant_cores()
: static_cast<uint32_t>(cpu_threads);
ET_LOG(
Info, "Resetting threadpool with num threads = %d", num_performant_cores);
if (num_performant_cores > 0) {
::executorch::extension::threadpool::get_threadpool()
->_unsafe_reset_threadpool(num_performant_cores);
}
#endif // ET_USE_THREADPOOL
// Create a loader to get the data of the program file. There are other
// DataLoaders that use mmap() or point to data that's already in memory, and
// users can create their own DataLoaders to load from arbitrary sources.
Expand Down
Loading