Skip to content

Add cpu_thread setting logic to xnn_executor_runner #8902

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 30 commits into the base branch on Mar 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 23 additions & 13 deletions backends/xnnpack/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,14 @@ if(NOT PYTHON_EXECUTABLE)
resolve_python_executable()
endif()

# NB: Enabling this will serialize execution of delegate instances
# Keeping this OFF by default to maintain existing behavior, to be revisited.
# NB: Enabling this will serialize execution of delegate instances Keeping this
# OFF by default to maintain existing behavior, to be revisited.
option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE
"Enable workspace sharing across different delegate instances" ON)
# Keeping this OFF by default due to regressions in decode
# and model load with kleidi kernels
option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI
"Enable Arm Kleidi kernels" OFF)
"Enable workspace sharing across different delegate instances" ON
)
# Keeping this OFF by default due to regressions in decode and model load with
# kleidi kernels
option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI "Enable Arm Kleidi kernels" OFF)
if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE)
add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE)
endif()
Expand Down Expand Up @@ -100,8 +100,7 @@ include(cmake/Dependencies.cmake)
list(TRANSFORM _xnnpack_backend__srcs PREPEND "${EXECUTORCH_ROOT}/")
add_library(xnnpack_backend STATIC ${_xnnpack_backend__srcs})
target_link_libraries(
xnnpack_backend PRIVATE ${xnnpack_third_party} executorch_core
xnnpack_schema
xnnpack_backend PRIVATE ${xnnpack_third_party} executorch_core xnnpack_schema
)

target_include_directories(
Expand All @@ -119,6 +118,12 @@ target_include_directories(
target_compile_options(xnnpack_backend PUBLIC ${_common_compile_options})
target_link_options_shared_lib(xnnpack_backend)

if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
list(APPEND xnn_executor_runner_libs optimized_native_cpu_ops_lib)
else()
list(APPEND xnn_executor_runner_libs portable_ops_lib)
endif()

list(APPEND xnn_executor_runner_libs xnnpack_backend executorch)

# ios can only build library but not binary
Expand All @@ -134,14 +139,19 @@ if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
if(EXECUTORCH_BUILD_DEVTOOLS)
list(APPEND xnn_executor_runner_libs etdump)
else()
message(SEND_ERROR "Use of 'EXECUTORCH_ENABLE_EVENT_TRACER' requires 'EXECUTORCH_BUILD_DEVTOOLS' to be enabled.")
message(
SEND_ERROR
"Use of 'EXECUTORCH_ENABLE_EVENT_TRACER' requires 'EXECUTORCH_BUILD_DEVTOOLS' to be enabled."
)
endif()
endif()

target_link_libraries(
xnn_executor_runner gflags portable_ops_lib ${xnn_executor_runner_libs}
)
target_link_libraries(xnn_executor_runner gflags ${xnn_executor_runner_libs})
target_compile_options(xnn_executor_runner PUBLIC ${_common_compile_options})
if(EXECUTORCH_BUILD_PTHREADPOOL)
target_link_libraries(xnn_executor_runner extension_threadpool pthreadpool)
target_compile_definitions(xnn_executor_runner PRIVATE ET_USE_THREADPOOL)
endif()
endif()

install(
Expand Down
21 changes: 21 additions & 0 deletions examples/portable/executor_runner/executor_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@
#include <executorch/devtools/etdump/etdump_flatcc.h>
#endif // ET_EVENT_TRACER_ENABLED

#if defined(ET_USE_THREADPOOL)
#include <executorch/extension/threadpool/cpuinfo_utils.h>
#include <executorch/extension/threadpool/threadpool.h>
#endif

static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4 MB

static uint8_t temp_allocator_pool[1024U * 1024U];
Expand All @@ -47,6 +52,10 @@ DEFINE_uint32(num_executions, 1, "Number of times to run the model.");
#ifdef ET_EVENT_TRACER_ENABLED
DEFINE_string(etdump_path, "model.etdump", "Write ETDump data to this path.");
#endif // ET_EVENT_TRACER_ENABLED
DEFINE_int32(
cpu_threads,
-1,
"Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");

using executorch::extension::FileDataLoader;
using executorch::runtime::Error;
Expand Down Expand Up @@ -124,6 +133,18 @@ int main(int argc, char** argv) {
return 1;
}

auto cpu_threads = FLAGS_cpu_threads;
#if defined(ET_USE_THREADPOOL)
uint32_t num_performant_cores = cpu_threads == -1
? ::executorch::extension::cpuinfo::get_num_performant_cores()
: static_cast<uint32_t>(cpu_threads);
ET_LOG(
Info, "Resetting threadpool with num threads = %d", num_performant_cores);
if (num_performant_cores > 0) {
::executorch::extension::threadpool::get_threadpool()
->_unsafe_reset_threadpool(num_performant_cores);
}
#endif // ET_USE_THREADPOOL
// Create a loader to get the data of the program file. There are other
// DataLoaders that use mmap() or point to data that's already in memory, and
// users can create their own DataLoaders to load from arbitrary sources.
Expand Down
Loading