Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -151,4 +151,4 @@ steps:
build.message =~ /\[only benchmarks\]/ ||
build.message !~ /\[only/ && !build.pull_request.draft &&
build.message !~ /\[skip benchmarks\]/
timeout_in_minutes: 45
timeout_in_minutes: 30
5 changes: 4 additions & 1 deletion perf/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,10 @@ gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
# group["1d_inplace"] = @benchmarkable Metal.@sync reverse!($gpu_vec)
# group["2d_inplace"] = @benchmarkable Metal.@sync reverse!($gpu_mat; dims=1)
# end
group["construct"] = @benchmarkable MtlArray{Int,1}(undef, 1)

# 'evals=1' added to prevent a hang when running benchmarks on CI
# TODO: Investigate cause and properly fix.
group["construct"] = @benchmarkable MtlArray{Int,1}(undef, 1) evals=1

group["broadcast"] = @benchmarkable Metal.@sync $gpu_mat .= 0f0

Expand Down
2 changes: 1 addition & 1 deletion src/Metal.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module Metal
using GPUArrays
using Adapt
using GPUCompiler
using GPUToolbox: SimpleVersion, @sv_str
using GPUToolbox
using LLVM
using LLVM.Interop
import LLVMDowngrader_jll
Expand Down
35 changes: 35 additions & 0 deletions src/mapreduce.jl
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,34 @@ function partial_mapreduce_device(f, op, neutral, maxthreads, ::Val{Rreduce},
return
end

# Naive mapreduce kernel for when there are many slices to reduce: each thread
# owns one element of `Rother` (the kept dimensions) and serially folds the
# entire `Rreduce` range for that slice. No inter-thread communication needed.
#
# - `f`, `op`: the mapped function and the (associative) reduction operator.
# - `neutral`: neutral element of `op`, or `nothing` to seed from `R` instead.
# - `Rreduce` / `Rother`: CartesianIndices over the reduced / kept dimensions.
# - `R`: output array, indexed by `Rother`; `As`: input, indexed by both.
function big_mapreduce_kernel(f, op, neutral, ::Val{Rreduce}, ::Val{Rother}, R, As) where {Rreduce, Rother}
    # Linear position of this thread across the whole grid. The stride between
    # consecutive threadgroups is the number of threads per group (was
    # erroneously `threadgroups_per_grid_1d()`, i.e. the number of groups,
    # which leaves gaps and duplicates whenever groups != threads-per-group).
    grid_idx = thread_position_in_threadgroup_1d() + (threadgroup_position_in_grid_1d() - 1u32) * threads_per_threadgroup_1d()

    @inbounds if grid_idx <= length(Rother)
        Iother = Rother[grid_idx]

        # load the neutral value; without an explicit neutral, seed the
        # accumulator from the destination (callers must have initialized R).
        neutral = if neutral === nothing
            R[Iother]
        else
            neutral
        end

        val = op(neutral, neutral)

        # serially reduce this thread's slice; `Iother + Ireduce - Ibegin`
        # rebases the reduction index onto the input's coordinates.
        Ibegin = Rreduce[1]
        for Ireduce in Rreduce
            val = op(val, f(As[Iother + Ireduce - Ibegin]))
        end
        R[Iother] = val
    end
    return
end

## COV_EXCL_STOP

_big_mapreduce_threshold(dev) = dev.maxThreadsPerThreadgroup.width * num_gpu_cores()

function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
A::Union{AbstractArray,Broadcast.Broadcasted};
init=nothing) where {F, OP, T}
Expand All @@ -165,6 +191,15 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
# NOTE: we hard-code `OneTo` (`first.(axes(A))` would work too) or we get a
# CartesianIndices object with UnitRanges that behave badly on the GPU.
@assert length(Rall) == length(Rother) * length(Rreduce)
@assert length(Rother) > 0

# If `Rother` is large enough, then a naive loop is more efficient than partial reductions.
if length(Rother) >= _big_mapreduce_threshold(device(R))
threads = min(length(Rreduce), 512)
groups = cld(length(Rother), threads)
kernel = @metal threads groups big_mapreduce_kernel(f, op, init, Val(Rreduce), Val(Rother), R, A)
return R
end

# when the reduction dimension is contiguous in memory, we can improve performance
# by having each thread read multiple consecutive elements. base on experiments,
Expand Down
Loading