Commit 243c885
Proof-of-concept: parallelize argmin (#9066)
I attempted to port `at::parallel_reduce` to ExecuTorch and use it in reduce_util.h, but that turned out to be much trickier than expected. (In brief: a parallel reduction requires two steps: 1) split the input range into chunks and reduce over each chunk (easily done, just like parallel_for), and then 2) combine the sub-results from the chunks. The reduction function accepted by reduce_over_dim is not well-suited to step (2).) Instead, I ported the parallelization strategy used by binary_kernel_reduce_lastdim: just parallelize over the *non*-reduced dimensions of the tensor. I don't see any reason this strategy isn't generally applicable, and we aren't otherwise capable of parallelizing reductions, so I haven't gated it to the case where we are reducing over a contiguous last dimension. I will send a follow-up that packages this strategy up nicely and uses it in our portable reduction ops.
1 parent 907e97e commit 243c885
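
To make the two-step structure in the message concrete, here is a minimal sketch of a parallel_reduce-style reduction in plain C++. It uses std::thread and illustrative names rather than any real ExecuTorch or ATen API; the point is only to show why step (2) needs a function that merges two accumulators, which reduce_over_dim's element-at-a-time callback cannot express.

#include <algorithm>
#include <cstdint>
#include <thread>
#include <vector>

template <typename T, typename Reduce, typename Combine>
T parallel_reduce_sketch(
    const T* data,
    int64_t n,
    int64_t num_chunks,
    T identity,
    Reduce reduce,
    Combine combine) {
  // Step 1: reduce each chunk independently. This part has the same
  // shape as a parallel_for and is easy to express.
  std::vector<T> partials(num_chunks, identity);
  std::vector<std::thread> workers;
  const int64_t chunk = (n + num_chunks - 1) / num_chunks;
  for (int64_t c = 0; c < num_chunks; ++c) {
    workers.emplace_back([&, c] {
      const int64_t begin = c * chunk;
      const int64_t end = std::min(n, begin + chunk);
      for (int64_t i = begin; i < end; ++i) {
        partials[c] = reduce(partials[c], data[i]);
      }
    });
  }
  for (auto& w : workers) {
    w.join();
  }
  // Step 2: merge the per-chunk sub-results. This needs a function that
  // combines two *accumulators*; reduce_over_dim's callback only folds a
  // single element (plus its index) into an accumulator, so it cannot
  // express this merge directly.
  T result = identity;
  for (const T& p : partials) {
    result = combine(result, p);
  }
  return result;
}

For argmin specifically, T would be a (value, index) pair, reduce would apply the NaN-aware comparison seen in the diff below, and combine would pick whichever pair holds the smaller value.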

File tree

2 files changed: +39 −24 lines

kernels/portable/cpu/op_argmin.cpp

Lines changed: 38 additions & 24 deletions
@@ -12,6 +12,7 @@
 
 #include <executorch/kernels/portable/cpu/util/reduce_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
+#include <executorch/runtime/kernel/thread_parallel_interface.h>
 #include <executorch/runtime/platform/assert.h>
 
 namespace torch {
@@ -47,30 +48,43 @@ Tensor& argmin_out(
   ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "argmin.out", CTYPE, [&] {
     long* out_data = out.mutable_data_ptr<long>();
 
-    for (const auto out_ix : c10::irange(out.numel())) {
-      std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>(
-          [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) {
-            // The condition below as written is equivalent to
-            // !isnan(acc_val) && (isnan(v) || v < acc_val). Cases:
-            // - If neither acc_val nor v is NaN, !(v >= acc_val) is
-            //   trivially equivalent to v < acc_val.
-            // - If acc_val is NaN, the whole thing is trivially false.
-            // - If acc_val is not NaN and v is NaN, then v >= acc_val
-            //   is false because all comparisons involving NaN are
-            //   false, so the result is true. The result is trivially
-            //   true for the equivalent condition that uses isnan(v)
-            //   as well.
-            if (!std::isnan(acc_val) && !(v >= acc_val)) {
-              acc_val = v;
-              acc_ix = ix;
-            }
-            return std::tuple<CTYPE, long>{acc_val, acc_ix};
-          },
-          in,
-          dim,
-          out_ix);
-      out_data[out_ix] = std::get<1>(acc);
-    }
+    // REVIEW: this is the parallelization strategy ATen uses
+    // specifically when the reduction is along the last dimension and
+    // that dimension is contiguous. Is there any particular reason we
+    // shouldn't just always use this strategy, since we aren't
+    // otherwise capable of parallelizing reductions?
+    const int64_t reduction_size = get_reduced_dim_product(in, dim);
+    const auto grain_size = std::max(
+        static_cast<int64_t>(1),
+        executorch::extension::internal::GRAIN_SIZE / reduction_size);
+    const bool success = executorch::extension::parallel_for(
+        0, out.numel(), grain_size, [&](const auto begin, const auto end) {
+          for (const auto out_ix : c10::irange(begin, end)) {
+            std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>(
+                [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) {
+                  // The condition below as written is equivalent to
+                  // !isnan(acc_val) && (isnan(v) || v < acc_val). Cases:
+                  // - If neither acc_val nor v is NaN, !(v >= acc_val) is
+                  //   trivially equivalent to v < acc_val.
+                  // - If acc_val is NaN, the whole thing is trivially false.
+                  // - If acc_val is not NaN and v is NaN, then v >= acc_val
+                  //   is false because all comparisons involving NaN are
+                  //   false, so the result is true. The result is trivially
+                  //   true for the equivalent condition that uses isnan(v)
+                  //   as well.
+                  if (!std::isnan(acc_val) && !(v >= acc_val)) {
+                    acc_val = v;
+                    acc_ix = ix;
+                  }
+                  return std::tuple<CTYPE, long>{acc_val, acc_ix};
+                },
+                in,
+                dim,
+                out_ix);
+            out_data[out_ix] = std::get<1>(acc);
+          }
+        });
+    ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
   });
 
   return out;
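
To make the grain-size arithmetic above concrete, here is a standalone sketch in plain C++. The value 32768 is an assumption for illustration (it matches ATen's at::internal::GRAIN_SIZE; ExecuTorch's executorch::extension::internal::GRAIN_SIZE may differ), and the sequential loop only mimics how a parallel_for implementation might carve up the range.

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  // Assumed stand-in for executorch::extension::internal::GRAIN_SIZE;
  // ATen uses 32768, ExecuTorch's value may differ.
  const int64_t kGrainSize = 32768;
  const int64_t reduction_size = 1024; // inputs reduced per output element
  const int64_t out_numel = 256;       // number of output elements

  // Each output element costs ~reduction_size input reads, so shrink the
  // per-task output count until one task touches ~kGrainSize inputs.
  const int64_t grain_size =
      std::max(static_cast<int64_t>(1), kGrainSize / reduction_size);

  // Mimic how a parallel_for-style runner would split [0, out_numel);
  // each [begin, end) task would run the reduce_over_dim loop above.
  for (int64_t begin = 0; begin < out_numel; begin += grain_size) {
    const int64_t end = std::min(out_numel, begin + grain_size);
    std::cout << "task over outputs [" << begin << ", " << end << "): ~"
              << (end - begin) * reduction_size << " input elements\n";
  }
  return 0;
}

With these numbers grain_size comes out to 32, so the 256 outputs split into 8 tasks of 32 output elements, each touching roughly 32768 input elements: the per-output reduction cost is folded into the grain size so tasks stay coarse enough to amortize scheduling overhead.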

shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl

Lines changed: 1 addition & 0 deletions
@@ -284,6 +284,7 @@ ATEN_OPS = (
         name = "op_argmin",
         deps = [
             "//executorch/kernels/portable/cpu/util:reduce_util",
+            "//executorch/runtime/kernel:thread_parallel_interface",
         ],
     ),
     op_target(
