parallelize optimized op_where using parallel_for (#9059)

swolchok · web-flow · commit c183ef05fa07 · 2025-03-12T12:18:07.000-07:00
An internal model saw a 5.7% latency improvement (313.8 ms before, 296.0 ms
after).
diff --git a/kernels/optimized/cpu/op_where.cpp b/kernels/optimized/cpu/op_where.cpp
@@ -7,7 +7,7 @@
  */
 #include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
-#include <iostream>
+#include <executorch/runtime/kernel/thread_parallel_interface.h>
 
 namespace torch {
 namespace executor {
@@ -58,15 +58,31 @@ Tensor& opt_where_out(
       const bool* const data_cond = cond.const_data_ptr<bool>();
       CTYPE_COMPUTE* const data_out = out.data_ptr<CTYPE_COMPUTE>();
       if (any_is_broadcasted) {
-        for (const auto [out_index, a_index, b_index, cond_index] :
-             BroadcastIndexesRange<3>(out, a, b, cond)) {
-          data_out[out_index] =
-              data_cond[cond_index] ? data_a[a_index] : data_b[b_index];
-        }
+        executorch::extension::parallel_for(
+            0,
+            out_numel,
+            ::executorch::extension::internal::GRAIN_SIZE,
+            [&](const auto begin, const auto end) {
+              auto range = BroadcastIndexesRange<3>(out, a, b, cond);
+              auto begin_it = range.begin();
+              begin_it += begin;
+              for (; (*begin_it)[0] < end; ++begin_it) {
+                const auto [out_index, a_index, b_index, cond_index] =
+                    *begin_it;
+                data_out[out_index] =
+                    data_cond[cond_index] ? data_a[a_index] : data_b[b_index];
+              }
+            });
       } else {
-        for (const auto i : c10::irange(out_numel)) {
-          data_out[i] = data_cond[i] ? data_a[i] : data_b[i];
-        }
+        executorch::extension::parallel_for(
+            0,
+            out_numel,
+            ::executorch::extension::internal::GRAIN_SIZE,
+            [&](const auto begin, const auto end) {
+              for (const auto i : c10::irange(begin, end)) {
+                data_out[i] = data_cond[i] ? data_a[i] : data_b[i];
+              }
+            });
       }
     });
   } else {
diff --git a/runtime/kernel/thread_parallel_interface.h b/runtime/kernel/thread_parallel_interface.h
@@ -44,6 +44,9 @@ inline bool parallel_for_no_threadpool(
   return true;
 }
 
+// Match GRAIN_SIZE from PyTorch core.
+// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/TensorIterator.h#L78
+constexpr int64_t GRAIN_SIZE = 32768;
 } // namespace internal
 
 #ifdef ET_USE_THREADPOOL

Original file line number	Diff line number	Diff line change
`@@ -44,6 +44,9 @@ inline bool parallel_for_no_threadpool(`
`44`	`44`	`return true;`
`45`	`45`	`}`
`46`	`46`
	`47`	`+// Match GRAIN_SIZE from PyTorch core.`
	`48`	`+// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/TensorIterator.h#L78`
	`49`	`+constexpr int64_t GRAIN_SIZE = 32768;`
`47`	`50`	`} // namespace internal`
`48`	`51`
`49`	`52`	`#ifdef ET_USE_THREADPOOL`