[Do not merge] Test KernelIntrinsics #688
Open
christiangnrd wants to merge 9 commits into main from kaintr
Conversation
Contributor
Your PR requires formatting changes to meet the project's style guidelines. The suggested changes:
```diff
diff --git a/src/MetalKernels.jl b/src/MetalKernels.jl
index 4e856194..7573c5e1 100644
--- a/src/MetalKernels.jl
+++ b/src/MetalKernels.jl
@@ -136,26 +136,26 @@ end
KI.argconvert(::MetalBackend, arg) = mtlconvert(arg)
-function KI.kernel_function(::MetalBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::MetalBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
kern = mtlfunction(f, tt; name, kwargs...)
- KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
+ return KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
end
-function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups=1, workgroupsize=1)
+function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups = 1, workgroupsize = 1)
KI.check_launch_args(numworkgroups, workgroupsize)
- obj.kern(args...; threads=workgroupsize, groups=numworkgroups)
+ return obj.kern(args...; threads = workgroupsize, groups = numworkgroups)
end
-function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int=typemax(Int))::Int
- Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
+function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int = typemax(Int))::Int
+ return Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
end
function KI.max_work_group_size(::MetalBackend)::Int
- Int(device().maxThreadsPerThreadgroup.width)
+ return Int(device().maxThreadsPerThreadgroup.width)
end
function KI.multiprocessor_count(::MetalBackend)::Int
- Metal.num_gpu_cores()
+ return Metal.num_gpu_cores()
end
diff --git a/src/broadcast.jl b/src/broadcast.jl
index 72ced3ed..e90f5826 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -66,8 +66,8 @@ end
if _broadcast_shapes[Is] > BROADCAST_SPECIALIZATION_THRESHOLD
## COV_EXCL_START
function broadcast_cartesian_static(dest, bc, Is)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
while 1 <= i <= length(dest)
I = @inbounds Is[i]
@inbounds dest[I] = bc[I]
@@ -91,8 +91,8 @@ end
(isa(IndexStyle(dest), IndexLinear) && isa(IndexStyle(bc), IndexLinear))
## COV_EXCL_START
function broadcast_linear(dest, bc)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
while 1 <= i <= length(dest)
@inbounds dest[i] = bc[i]
i += stride
@@ -150,8 +150,8 @@ end
else
## COV_EXCL_START
function broadcast_cartesian(dest, bc)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
while 1 <= i <= length(dest)
I = @inbounds CartesianIndices(dest)[i]
@inbounds dest[I] = bc[I]
diff --git a/src/device/random.jl b/src/device/random.jl
index 12b053a2..edc999cd 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -89,8 +89,8 @@ end
@inbounds global_random_counters()[simdgroupId]
elseif field === :ctr2
globalId = KI.get_global_id().x +
- (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
- (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+ (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+ (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
globalId % UInt32
end::UInt32
end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 7be5ef43..a737e8d0 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -224,7 +224,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
# we might not be able to launch all those threads to reduce each slice in one go.
# that's why each threads also loops across their inputs, processing multiple values
# so that we can span the entire reduction dimension using a single item group.
- kernel = KI.@kernel backend launch = false partial_mapreduce_device(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
+ kernel = KI.@kernel backend launch = false partial_mapreduce_device(
+ f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A)
# how many threads do we want?
@@ -260,7 +261,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
# we can cover the dimensions to reduce using a single group
kernel(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A;
- workgroupsize = threads, numworkgroups = groups)
+ workgroupsize = threads, numworkgroups = groups
+ )
else
# temporary empty array whose type will match the final partial array
partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))
@@ -287,7 +289,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
partial_kernel(f, op, init, Val(threads), Val(Rreduce),
Val(Rother), Val(UInt64(length(Rother))),
Val(grain), Val(shuffle), partial, A;
- numworkgroups = partial_groups, workgroupsize = partial_threads)
+ numworkgroups = partial_groups, workgroupsize = partial_threads
+ )
GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
end
diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl
index 221ee680..6f9d5a2c 100644
--- a/test/kernelabstractions.jl
+++ b/test/kernelabstractions.jl
@@ -7,6 +7,6 @@ Testsuite.testsuite(()->MetalBackend(), "Metal", Metal, MtlArray, Metal.MtlDevic
"Convert", # depends on https://github.com/JuliaGPU/Metal.jl/issues/69
"SpecialFunctions", # no equivalent Metal intrinsics for gamma, erf, etc
"sparse", # not supported yet
- "CPU synchronization",
- "fallback test: callable types",
+ "CPU synchronization",
+ "fallback test: callable types",
]))
diff --git a/test/runtests.jl b/test/runtests.jl
index bcf20b51..7336f956 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
@static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "Metal.jl") == "Metal.jl"
using Pkg
- Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+ Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
end
 using Metal
```
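For context, here is a minimal, self-contained sketch of the launch convention this PR exercises, combining the patterns visible in the hunks above: grid-stride indexing from broadcast.jl, the `launch = false` compile-then-launch flow from mapreduce.jl, and the `numworkgroups`/`workgroupsize` keywords that MetalKernels.jl forwards as `groups`/`threads`. The `KI` import path and the `vadd!` kernel are assumptions for illustration, not code from this PR:

```julia
# Sketch only: assumes `KI` aliases KernelAbstractions' KernelIntrinsics
# module (the test setup adds KernelAbstractions#main); `vadd!` and the
# array sizes are hypothetical stand-ins.
using Metal
using KernelAbstractions
const KI = KernelAbstractions.KernelIntrinsics  # assumed alias, matching the diff's `KI`

# Grid-stride kernel in the style of the broadcast.jl hunks:
# 1-based global index, striding by the total number of work items.
function vadd!(c, a, b)
    i = KI.get_global_id().x
    stride = KI.get_global_size().x
    while 1 <= i <= length(c)
        @inbounds c[i] = a[i] + b[i]
        i += stride
    end
    return
end

a = MtlArray(ones(Float32, 1024))
b = MtlArray(ones(Float32, 1024))
c = similar(a)
backend = MetalBackend()

# Compile without launching (as mapreduce.jl does), size the workgroup
# from the kernel's pipeline limit, then launch; MetalKernels.jl maps
# workgroupsize/numworkgroups onto Metal's threads/groups.
kernel = KI.@kernel backend launch = false vadd!(c, a, b)
threads = KI.kernel_max_work_group_size(kernel; max_work_items = length(c))
groups = cld(length(c), threads)
kernel(c, a, b; workgroupsize = threads, numworkgroups = groups)
```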
Force-pushed from 9ac3d49 to 6314372
Contributor
Metal Benchmarks
| Benchmark suite | Current: 415079d | Previous: 7268b55 | Ratio |
|---|---|---|---|
| latency/precompile | 31373470583.5 ns | 25030416375 ns | 1.25 |
| latency/ttfp | 2671095208 ns | 2265089000 ns | 1.18 |
| latency/import | 1722187041.5 ns | 1442167667 ns | 1.19 |
| integration/metaldevrt | 505542 ns | 829896 ns | 0.61 |
| integration/byval/slices=1 | 1176542 ns | 1533000 ns | 0.77 |
| integration/byval/slices=3 | 7285562.5 ns | 9201292 ns | 0.79 |
| integration/byval/reference | 1174709 ns | 1523416 ns | 0.77 |
| integration/byval/slices=2 | 2103062.5 ns | 2590021 ns | 0.81 |
| kernel/indexing | 251209 ns | 582500 ns | 0.43 |
| kernel/indexing_checked | 264417 ns | 607083 ns | 0.44 |
| kernel/launch | 14625 ns | 11375 ns | 1.29 |
| kernel/rand | 287291 ns | 553583 ns | 0.52 |
| array/construct | 7125 ns | 6125 ns | 1.16 |
| array/broadcast | 274667 ns | 585917 ns | 0.47 |
| array/random/randn/Float32 | 398583 ns | 777417 ns | 0.51 |
| array/random/randn!/Float32 | 320708 ns | 618958 ns | 0.52 |
| array/random/rand!/Int64 | 321500 ns | 548292 ns | 0.59 |
| array/random/rand!/Float32 | 282041 ns | 577292 ns | 0.49 |
| array/random/rand/Int64 | 492625 ns | 783750 ns | 0.63 |
| array/random/rand/Float32 | 358875 ns | 581834 ns | 0.62 |
| array/accumulate/Int64/1d | 994583 ns | 1253875 ns | 0.79 |
| array/accumulate/Int64/dims=1 | 1184125 ns | 1796167 ns | 0.66 |
| array/accumulate/Int64/dims=2 | 1499959 ns | 2114375.5 ns | 0.71 |
| array/accumulate/Int64/dims=1L | 9720083 ns | 11663125 ns | 0.83 |
| array/accumulate/Int64/dims=2L | 7793771 ns | 9701167 ns | 0.80 |
| array/accumulate/Float32/1d | 820709 ns | 1102646 ns | 0.74 |
| array/accumulate/Float32/dims=1 | 1052583 ns | 1531229.5 ns | 0.69 |
| array/accumulate/Float32/dims=2 | 1367020.5 ns | 1843583 ns | 0.74 |
| array/accumulate/Float32/dims=1L | 8715229 ns | 9782249.5 ns | 0.89 |
| array/accumulate/Float32/dims=2L | 4693125 ns | 7210459 ns | 0.65 |
| array/reductions/reduce/Int64/1d | 708083 ns | 1510083.5 ns | 0.47 |
| array/reductions/reduce/Int64/dims=1 | 658667 ns | 1069500 ns | 0.62 |
| array/reductions/reduce/Int64/dims=2 | 903333 ns | 1115875 ns | 0.81 |
| array/reductions/reduce/Int64/dims=1L | 1246666.5 ns | 2004458 ns | 0.62 |
| array/reductions/reduce/Int64/dims=2L | 2299854.5 ns | 4227000 ns | 0.54 |
| array/reductions/reduce/Float32/1d | 523000 ns | 992042 ns | 0.53 |
| array/reductions/reduce/Float32/dims=1 | 394083 ns | 802625 ns | 0.49 |
| array/reductions/reduce/Float32/dims=2 | 444417 ns | 827750 ns | 0.54 |
| array/reductions/reduce/Float32/dims=1L | 710875 ns | 1302958 ns | 0.55 |
| array/reductions/reduce/Float32/dims=2L | 1317000 ns | 1808833.5 ns | 0.73 |
| array/reductions/mapreduce/Int64/1d | 712208 ns | 1533333.5 ns | 0.46 |
| array/reductions/mapreduce/Int64/dims=1 | 658875 ns | 1069542 ns | 0.62 |
| array/reductions/mapreduce/Int64/dims=2 | 902375 ns | 1201645.5 ns | 0.75 |
| array/reductions/mapreduce/Int64/dims=1L | 1179021 ns | 1994459 ns | 0.59 |
| array/reductions/mapreduce/Int64/dims=2L | 2297625 ns | 3609417 ns | 0.64 |
| array/reductions/mapreduce/Float32/1d | 520167 ns | 1017625 ns | 0.51 |
| array/reductions/mapreduce/Float32/dims=1 | 395000 ns | 797250 ns | 0.50 |
| array/reductions/mapreduce/Float32/dims=2 | 444959 ns | 797770.5 ns | 0.56 |
| array/reductions/mapreduce/Float32/dims=1L | 750729 ns | 1305708 ns | 0.57 |
| array/reductions/mapreduce/Float32/dims=2L | 1285542 ns | 1794125 ns | 0.72 |
| array/private/copyto!/gpu_to_gpu | 238041 ns | 629917 ns | 0.38 |
| array/private/copyto!/cpu_to_gpu | 261667 ns | 786125 ns | 0.33 |
| array/private/copyto!/gpu_to_cpu | 263542 ns | 775125 ns | 0.34 |
| array/private/iteration/findall/int | 1243084 ns | 1538333 ns | 0.81 |
| array/private/iteration/findall/bool | 1102500 ns | 1423333 ns | 0.77 |
| array/private/iteration/findfirst/int | 1193375 ns | 2029791 ns | 0.59 |
| array/private/iteration/findfirst/bool | 1192333 ns | 2010458 ns | 0.59 |
| array/private/iteration/scalar | 1787458 ns | 4660542 ns | 0.38 |
| array/private/iteration/logical | 1642687.5 ns | 2547167 ns | 0.64 |
| array/private/iteration/findmin/1d | 1423584 ns | 2461875 ns | 0.58 |
| array/private/iteration/findmin/2d | 1209375 ns | 1788250 ns | 0.68 |
| array/private/copy | 346000 ns | 597979 ns | 0.58 |
| array/shared/copyto!/gpu_to_gpu | 86292 ns | 82583 ns | 1.04 |
| array/shared/copyto!/cpu_to_gpu | 86959 ns | 82166 ns | 1.06 |
| array/shared/copyto!/gpu_to_cpu | 83541 ns | 83125 ns | 1.01 |
| array/shared/iteration/findall/int | 1241833 ns | 1559792 ns | 0.80 |
| array/shared/iteration/findall/bool | 1111375 ns | 1436625 ns | 0.77 |
| array/shared/iteration/findfirst/int | 1004833 ns | 1621791 ns | 0.62 |
| array/shared/iteration/findfirst/bool | 1003625 ns | 1614209 ns | 0.62 |
| array/shared/iteration/scalar | 209583 ns | 207042 ns | 1.01 |
| array/shared/iteration/logical | 1487250 ns | 2451874.5 ns | 0.61 |
| array/shared/iteration/findmin/1d | 1238917 ns | 2077833 ns | 0.60 |
| array/shared/iteration/findmin/2d | 1207584 ns | 1775937.5 ns | 0.68 |
| array/shared/copy | 350292 ns | 237375 ns | 1.48 |
| array/permutedims/4d | 1774124.5 ns | 2358167 ns | 0.75 |
| array/permutedims/2d | 545917 ns | 1150292 ns | 0.47 |
| array/permutedims/3d | 1146041 ns | 1643708 ns | 0.70 |
| metal/synchronization/stream | 18625 ns | 19500 ns | 0.96 |
| metal/synchronization/context | 18875 ns | 20166 ns | 0.94 |
This comment was automatically generated by a workflow using github-action-benchmark.
Force-pushed from 22e754e to 68db9c2
Force-pushed from 2b8dce1 to 0e76668
Force-pushed from db9a7dc to c802ccc
Force-pushed from 4b8f026 to ce67b4c
Force-pushed from ce67b4c to 03bb0dd
Not a draft, so that benchmarks also run.