Conversation

@christiangnrd
Member

Marking as not a draft to also run the benchmarks.

@github-actions
Contributor

github-actions bot commented Oct 22, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic main) to apply these changes.

Suggested changes:
diff --git a/src/MetalKernels.jl b/src/MetalKernels.jl
index 4e856194..7573c5e1 100644
--- a/src/MetalKernels.jl
+++ b/src/MetalKernels.jl
@@ -136,26 +136,26 @@ end
 
 KI.argconvert(::MetalBackend, arg) = mtlconvert(arg)
 
-function KI.kernel_function(::MetalBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::MetalBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
     kern = mtlfunction(f, tt; name, kwargs...)
-    KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
+    return KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
 end
 
-function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups=1, workgroupsize=1)
+function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups = 1, workgroupsize = 1)
     KI.check_launch_args(numworkgroups, workgroupsize)
 
-    obj.kern(args...; threads=workgroupsize, groups=numworkgroups)
+    return obj.kern(args...; threads = workgroupsize, groups = numworkgroups)
 end
 
 
-function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int=typemax(Int))::Int
-    Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
+function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int = typemax(Int))::Int
+    return Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
 end
 function KI.max_work_group_size(::MetalBackend)::Int
-    Int(device().maxThreadsPerThreadgroup.width)
+    return Int(device().maxThreadsPerThreadgroup.width)
 end
 function KI.multiprocessor_count(::MetalBackend)::Int
-    Metal.num_gpu_cores()
+    return Metal.num_gpu_cores()
 end
 
 
diff --git a/src/broadcast.jl b/src/broadcast.jl
index 72ced3ed..e90f5826 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -66,8 +66,8 @@ end
     if _broadcast_shapes[Is] > BROADCAST_SPECIALIZATION_THRESHOLD
         ## COV_EXCL_START
         function broadcast_cartesian_static(dest, bc, Is)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
              while 1 <= i <= length(dest)
                 I = @inbounds Is[i]
                 @inbounds dest[I] = bc[I]
@@ -91,8 +91,8 @@ end
        (isa(IndexStyle(dest), IndexLinear) && isa(IndexStyle(bc), IndexLinear))
         ## COV_EXCL_START
         function broadcast_linear(dest, bc)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
              while 1 <= i <= length(dest)
                  @inbounds dest[i] = bc[i]
                  i += stride
@@ -150,8 +150,8 @@ end
     else
         ## COV_EXCL_START
         function broadcast_cartesian(dest, bc)
-             i = KI.get_global_id().x
-             stride = KI.get_global_size().x
+            i = KI.get_global_id().x
+            stride = KI.get_global_size().x
              while 1 <= i <= length(dest)
                 I = @inbounds CartesianIndices(dest)[i]
                 @inbounds dest[I] = bc[I]
diff --git a/src/device/random.jl b/src/device/random.jl
index 12b053a2..edc999cd 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -89,8 +89,8 @@ end
         @inbounds global_random_counters()[simdgroupId]
     elseif field === :ctr2
         globalId = KI.get_global_id().x +
-                   (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
-                   (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+            (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+            (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
         globalId % UInt32
     end::UInt32
 end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 7be5ef43..a737e8d0 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -224,7 +224,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
     # we might not be able to launch all those threads to reduce each slice in one go.
     # that's why each threads also loops across their inputs, processing multiple values
     # so that we can span the entire reduction dimension using a single item group.
-    kernel = KI.@kernel backend launch = false partial_mapreduce_device(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
+    kernel = KI.@kernel backend launch = false partial_mapreduce_device(
+        f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
                                                           Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A)
 
     # how many threads do we want?
@@ -260,7 +261,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
         # we can cover the dimensions to reduce using a single group
         kernel(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
                Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A;
-               workgroupsize = threads, numworkgroups = groups)
+            workgroupsize = threads, numworkgroups = groups
+        )
     else
         # temporary empty array whose type will match the final partial array
 	    partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))
@@ -287,7 +289,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
         partial_kernel(f, op, init, Val(threads), Val(Rreduce),
                         Val(Rother), Val(UInt64(length(Rother))),
                         Val(grain), Val(shuffle), partial, A;
-                        numworkgroups = partial_groups, workgroupsize = partial_threads)
+            numworkgroups = partial_groups, workgroupsize = partial_threads
+        )
 
         GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
     end
diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl
index 221ee680..6f9d5a2c 100644
--- a/test/kernelabstractions.jl
+++ b/test/kernelabstractions.jl
@@ -7,6 +7,6 @@ Testsuite.testsuite(()->MetalBackend(), "Metal", Metal, MtlArray, Metal.MtlDevic
     "Convert",           # depends on https://github.com/JuliaGPU/Metal.jl/issues/69
     "SpecialFunctions",  # no equivalent Metal intrinsics for gamma, erf, etc
     "sparse",            # not supported yet
-    "CPU synchronization",
-    "fallback test: callable types",
+            "CPU synchronization",
+            "fallback test: callable types",
 ]))
diff --git a/test/runtests.jl b/test/runtests.jl
index bcf20b51..7336f956 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
 @static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "Metal.jl") == "Metal.jl"
     using Pkg
-    Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+    Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
 end
 
 using Metal
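
For context, the glue being reformatted in src/MetalKernels.jl backs the KernelAbstractions interface for the Metal backend. Below is a minimal user-level sketch of that interface; the kernel and array names are illustrative and not part of this PR, and the internal KI intrinsics touched in the diff are not called directly:

```julia
# Minimal sketch, assuming a Metal-capable machine. Uses only the public
# KernelAbstractions API that src/MetalKernels.jl wires up for MetalBackend.
using Metal, KernelAbstractions

@kernel function vadd!(c, a, b)
    i = @index(Global)                # linear global index
    @inbounds c[i] = a[i] + b[i]
end

a = MtlArray(rand(Float32, 1024))
b = MtlArray(rand(Float32, 1024))
c = similar(a)

backend = MetalBackend()
vadd!(backend)(c, a, b; ndrange = length(c))  # workgroup size is picked by the backend
KernelAbstractions.synchronize(backend)
```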

@christiangnrd force-pushed the kaintr branch 3 times, most recently from 9ac3d49 to 6314372 on October 22, 2025 04:31
@github-actions
Contributor

github-actions bot left a comment

Metal Benchmarks

| Benchmark suite | Current: 415079d | Previous: 7268b55 | Ratio |
|:---|---:|---:|---:|
| latency/precompile | 31373470583.5 ns | 25030416375 ns | 1.25 |
| latency/ttfp | 2671095208 ns | 2265089000 ns | 1.18 |
| latency/import | 1722187041.5 ns | 1442167667 ns | 1.19 |
| integration/metaldevrt | 505542 ns | 829896 ns | 0.61 |
| integration/byval/slices=1 | 1176542 ns | 1533000 ns | 0.77 |
| integration/byval/slices=3 | 7285562.5 ns | 9201292 ns | 0.79 |
| integration/byval/reference | 1174709 ns | 1523416 ns | 0.77 |
| integration/byval/slices=2 | 2103062.5 ns | 2590021 ns | 0.81 |
| kernel/indexing | 251209 ns | 582500 ns | 0.43 |
| kernel/indexing_checked | 264417 ns | 607083 ns | 0.44 |
| kernel/launch | 14625 ns | 11375 ns | 1.29 |
| kernel/rand | 287291 ns | 553583 ns | 0.52 |
| array/construct | 7125 ns | 6125 ns | 1.16 |
| array/broadcast | 274667 ns | 585917 ns | 0.47 |
| array/random/randn/Float32 | 398583 ns | 777417 ns | 0.51 |
| array/random/randn!/Float32 | 320708 ns | 618958 ns | 0.52 |
| array/random/rand!/Int64 | 321500 ns | 548292 ns | 0.59 |
| array/random/rand!/Float32 | 282041 ns | 577292 ns | 0.49 |
| array/random/rand/Int64 | 492625 ns | 783750 ns | 0.63 |
| array/random/rand/Float32 | 358875 ns | 581834 ns | 0.62 |
| array/accumulate/Int64/1d | 994583 ns | 1253875 ns | 0.79 |
| array/accumulate/Int64/dims=1 | 1184125 ns | 1796167 ns | 0.66 |
| array/accumulate/Int64/dims=2 | 1499959 ns | 2114375.5 ns | 0.71 |
| array/accumulate/Int64/dims=1L | 9720083 ns | 11663125 ns | 0.83 |
| array/accumulate/Int64/dims=2L | 7793771 ns | 9701167 ns | 0.80 |
| array/accumulate/Float32/1d | 820709 ns | 1102646 ns | 0.74 |
| array/accumulate/Float32/dims=1 | 1052583 ns | 1531229.5 ns | 0.69 |
| array/accumulate/Float32/dims=2 | 1367020.5 ns | 1843583 ns | 0.74 |
| array/accumulate/Float32/dims=1L | 8715229 ns | 9782249.5 ns | 0.89 |
| array/accumulate/Float32/dims=2L | 4693125 ns | 7210459 ns | 0.65 |
| array/reductions/reduce/Int64/1d | 708083 ns | 1510083.5 ns | 0.47 |
| array/reductions/reduce/Int64/dims=1 | 658667 ns | 1069500 ns | 0.62 |
| array/reductions/reduce/Int64/dims=2 | 903333 ns | 1115875 ns | 0.81 |
| array/reductions/reduce/Int64/dims=1L | 1246666.5 ns | 2004458 ns | 0.62 |
| array/reductions/reduce/Int64/dims=2L | 2299854.5 ns | 4227000 ns | 0.54 |
| array/reductions/reduce/Float32/1d | 523000 ns | 992042 ns | 0.53 |
| array/reductions/reduce/Float32/dims=1 | 394083 ns | 802625 ns | 0.49 |
| array/reductions/reduce/Float32/dims=2 | 444417 ns | 827750 ns | 0.54 |
| array/reductions/reduce/Float32/dims=1L | 710875 ns | 1302958 ns | 0.55 |
| array/reductions/reduce/Float32/dims=2L | 1317000 ns | 1808833.5 ns | 0.73 |
| array/reductions/mapreduce/Int64/1d | 712208 ns | 1533333.5 ns | 0.46 |
| array/reductions/mapreduce/Int64/dims=1 | 658875 ns | 1069542 ns | 0.62 |
| array/reductions/mapreduce/Int64/dims=2 | 902375 ns | 1201645.5 ns | 0.75 |
| array/reductions/mapreduce/Int64/dims=1L | 1179021 ns | 1994459 ns | 0.59 |
| array/reductions/mapreduce/Int64/dims=2L | 2297625 ns | 3609417 ns | 0.64 |
| array/reductions/mapreduce/Float32/1d | 520167 ns | 1017625 ns | 0.51 |
| array/reductions/mapreduce/Float32/dims=1 | 395000 ns | 797250 ns | 0.50 |
| array/reductions/mapreduce/Float32/dims=2 | 444959 ns | 797770.5 ns | 0.56 |
| array/reductions/mapreduce/Float32/dims=1L | 750729 ns | 1305708 ns | 0.57 |
| array/reductions/mapreduce/Float32/dims=2L | 1285542 ns | 1794125 ns | 0.72 |
| array/private/copyto!/gpu_to_gpu | 238041 ns | 629917 ns | 0.38 |
| array/private/copyto!/cpu_to_gpu | 261667 ns | 786125 ns | 0.33 |
| array/private/copyto!/gpu_to_cpu | 263542 ns | 775125 ns | 0.34 |
| array/private/iteration/findall/int | 1243084 ns | 1538333 ns | 0.81 |
| array/private/iteration/findall/bool | 1102500 ns | 1423333 ns | 0.77 |
| array/private/iteration/findfirst/int | 1193375 ns | 2029791 ns | 0.59 |
| array/private/iteration/findfirst/bool | 1192333 ns | 2010458 ns | 0.59 |
| array/private/iteration/scalar | 1787458 ns | 4660542 ns | 0.38 |
| array/private/iteration/logical | 1642687.5 ns | 2547167 ns | 0.64 |
| array/private/iteration/findmin/1d | 1423584 ns | 2461875 ns | 0.58 |
| array/private/iteration/findmin/2d | 1209375 ns | 1788250 ns | 0.68 |
| array/private/copy | 346000 ns | 597979 ns | 0.58 |
| array/shared/copyto!/gpu_to_gpu | 86292 ns | 82583 ns | 1.04 |
| array/shared/copyto!/cpu_to_gpu | 86959 ns | 82166 ns | 1.06 |
| array/shared/copyto!/gpu_to_cpu | 83541 ns | 83125 ns | 1.01 |
| array/shared/iteration/findall/int | 1241833 ns | 1559792 ns | 0.80 |
| array/shared/iteration/findall/bool | 1111375 ns | 1436625 ns | 0.77 |
| array/shared/iteration/findfirst/int | 1004833 ns | 1621791 ns | 0.62 |
| array/shared/iteration/findfirst/bool | 1003625 ns | 1614209 ns | 0.62 |
| array/shared/iteration/scalar | 209583 ns | 207042 ns | 1.01 |
| array/shared/iteration/logical | 1487250 ns | 2451874.5 ns | 0.61 |
| array/shared/iteration/findmin/1d | 1238917 ns | 2077833 ns | 0.60 |
| array/shared/iteration/findmin/2d | 1207584 ns | 1775937.5 ns | 0.68 |
| array/shared/copy | 350292 ns | 237375 ns | 1.48 |
| array/permutedims/4d | 1774124.5 ns | 2358167 ns | 0.75 |
| array/permutedims/2d | 545917 ns | 1150292 ns | 0.47 |
| array/permutedims/3d | 1146041 ns | 1643708 ns | 0.70 |
| metal/synchronization/stream | 18625 ns | 19500 ns | 0.96 |
| metal/synchronization/context | 18875 ns | 20166 ns | 0.94 |
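
For reference, the Ratio column appears to be Current / Previous, so values below 1.0 indicate this PR's run was faster than the baseline. A quick sanity check against two rows above (a sketch, assuming that definition):

```julia
# Ratio = current time / previous time, rounded to two decimals as in the table.
ratio(current, previous) = round(current / previous; digits = 2)

ratio(251_209, 582_500)                  # kernel/indexing    -> 0.43
ratio(31_373_470_583.5, 25_030_416_375)  # latency/precompile -> 1.25
```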

This comment was automatically generated by a workflow using github-action-benchmark.

Commits added to the branch:
- skip scripts tests on 1.10
- Project.toml
- This reverts commit f4dc915.
- This reverts commit d6b12f4.