Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -151,4 +151,4 @@ steps:
build.message =~ /\[only benchmarks\]/ ||
build.message !~ /\[only/ && !build.pull_request.draft &&
build.message !~ /\[skip benchmarks\]/
timeout_in_minutes: 45
timeout_in_minutes: 30
5 changes: 4 additions & 1 deletion perf/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,10 @@ gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
# group["1d_inplace"] = @benchmarkable Metal.@sync reverse!($gpu_vec)
# group["2d_inplace"] = @benchmarkable Metal.@sync reverse!($gpu_mat; dims=1)
# end
group["construct"] = @benchmarkable MtlArray{Int,1}(undef, 1)

# 'evals=1' added to prevent a hang when running benchmarks on CI
# TODO: Investigate cause and properly fix.
group["construct"] = @benchmarkable MtlArray{Int,1}(undef, 1) evals=1

group["broadcast"] = @benchmarkable Metal.@sync $gpu_mat .= 0f0

Expand Down
2 changes: 1 addition & 1 deletion src/Metal.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module Metal
using GPUArrays
using Adapt
using GPUCompiler
using GPUToolbox: SimpleVersion, @sv_str
using GPUToolbox
using LLVM
using LLVM.Interop
import LLVMDowngrader_jll
Expand Down
35 changes: 35 additions & 0 deletions src/mapreduce.jl
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,34 @@ function partial_mapreduce_device(f, op, neutral, maxthreads, ::Val{Rreduce},
return
end

# Naive mapreduce kernel for when there are many slices to reduce: each thread
# owns one element of `Rother` (the kept dimensions) and serially folds the
# entire `Rreduce` range for that slice. No inter-thread communication needed.
#
# - `f`, `op`: the mapped function and the (associative) reduction operator.
# - `neutral`: neutral element of `op`, or `nothing` to seed from `R` instead.
# - `Rreduce` / `Rother`: CartesianIndices over the reduced / kept dimensions.
# - `R`: output array, indexed by `Rother`; `As`: input, indexed by both.
function big_mapreduce_kernel(f, op, neutral, ::Val{Rreduce}, ::Val{Rother}, R, As) where {Rreduce, Rother}
    # Linear position of this thread across the whole grid. The stride between
    # consecutive threadgroups is the number of threads per group (was
    # erroneously `threadgroups_per_grid_1d()`, i.e. the number of groups,
    # which leaves gaps and duplicates whenever groups != threads-per-group).
    grid_idx = thread_position_in_threadgroup_1d() + (threadgroup_position_in_grid_1d() - 1u32) * threads_per_threadgroup_1d()

    @inbounds if grid_idx <= length(Rother)
        Iother = Rother[grid_idx]

        # load the neutral value; without an explicit neutral, seed the
        # accumulator from the destination (callers must have initialized R).
        neutral = if neutral === nothing
            R[Iother]
        else
            neutral
        end

        val = op(neutral, neutral)

        # serially reduce this thread's slice; `Iother + Ireduce - Ibegin`
        # rebases the reduction index onto the input's coordinates.
        Ibegin = Rreduce[1]
        for Ireduce in Rreduce
            val = op(val, f(As[Iother + Ireduce - Ibegin]))
        end
        R[Iother] = val
    end
    return
end

## COV_EXCL_STOP

_big_mapreduce_threshold(dev) = dev.maxThreadsPerThreadgroup.width * num_gpu_cores()

function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
A::Union{AbstractArray,Broadcast.Broadcasted};
init=nothing) where {F, OP, T}
Expand All @@ -165,6 +191,15 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
# NOTE: we hard-code `OneTo` (`first.(axes(A))` would work too) or we get a
# CartesianIndices object with UnitRanges that behave badly on the GPU.
@assert length(Rall) == length(Rother) * length(Rreduce)
@assert length(Rother) > 0

# If `Rother` is large enough, then a naive loop is more efficient than partial reductions.
if length(Rother) >= _big_mapreduce_threshold(device(R))
threads = min(length(Rreduce), 512)
groups = cld(length(Rother), threads)
kernel = @metal threads groups big_mapreduce_kernel(f, op, init, Val(Rreduce), Val(Rother), R, A)
return R
end

# when the reduction dimension is contiguous in memory, we can improve performance
# by having each thread read multiple consecutive elements. base on experiments,
Expand Down
Loading