[Do not merge] Test KernelIntrinsics #688
Open
christiangnrd wants to merge 9 commits into main from kaintr
Conversation
Contributor
Your PR requires formatting changes to meet the project's style guidelines. The suggested changes:
```diff
diff --git a/src/MetalKernels.jl b/src/MetalKernels.jl
index 4e856194..7573c5e1 100644
--- a/src/MetalKernels.jl
+++ b/src/MetalKernels.jl
@@ -136,26 +136,26 @@ end
KI.argconvert(::MetalBackend, arg) = mtlconvert(arg)
-function KI.kernel_function(::MetalBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+function KI.kernel_function(::MetalBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
kern = mtlfunction(f, tt; name, kwargs...)
- KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
+ return KI.Kernel{MetalBackend, typeof(kern)}(MetalBackend(), kern)
end
-function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups=1, workgroupsize=1)
+function (obj::KI.Kernel{MetalBackend})(args...; numworkgroups = 1, workgroupsize = 1)
KI.check_launch_args(numworkgroups, workgroupsize)
- obj.kern(args...; threads=workgroupsize, groups=numworkgroups)
+ return obj.kern(args...; threads = workgroupsize, groups = numworkgroups)
end
-function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int=typemax(Int))::Int
- Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
+function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:MetalBackend}; max_work_items::Int = typemax(Int))::Int
+ return Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items))
end
function KI.max_work_group_size(::MetalBackend)::Int
- Int(device().maxThreadsPerThreadgroup.width)
+ return Int(device().maxThreadsPerThreadgroup.width)
end
function KI.multiprocessor_count(::MetalBackend)::Int
- Metal.num_gpu_cores()
+ return Metal.num_gpu_cores()
end
diff --git a/src/broadcast.jl b/src/broadcast.jl
index 72ced3ed..e90f5826 100644
--- a/src/broadcast.jl
+++ b/src/broadcast.jl
@@ -66,8 +66,8 @@ end
if _broadcast_shapes[Is] > BROADCAST_SPECIALIZATION_THRESHOLD
## COV_EXCL_START
function broadcast_cartesian_static(dest, bc, Is)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
while 1 <= i <= length(dest)
I = @inbounds Is[i]
@inbounds dest[I] = bc[I]
@@ -91,8 +91,8 @@ end
(isa(IndexStyle(dest), IndexLinear) && isa(IndexStyle(bc), IndexLinear))
## COV_EXCL_START
function broadcast_linear(dest, bc)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
while 1 <= i <= length(dest)
@inbounds dest[i] = bc[i]
i += stride
@@ -150,8 +150,8 @@ end
else
## COV_EXCL_START
function broadcast_cartesian(dest, bc)
- i = KI.get_global_id().x
- stride = KI.get_global_size().x
+ i = KI.get_global_id().x
+ stride = KI.get_global_size().x
while 1 <= i <= length(dest)
I = @inbounds CartesianIndices(dest)[i]
@inbounds dest[I] = bc[I]
diff --git a/src/device/random.jl b/src/device/random.jl
index 12b053a2..edc999cd 100644
--- a/src/device/random.jl
+++ b/src/device/random.jl
@@ -89,8 +89,8 @@ end
@inbounds global_random_counters()[simdgroupId]
elseif field === :ctr2
globalId = KI.get_global_id().x +
- (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
- (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
+ (KI.get_global_id().y - 1i32) * KI.get_global_size().x +
+ (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y
globalId % UInt32
end::UInt32
end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 7be5ef43..a737e8d0 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -224,7 +224,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
# we might not be able to launch all those threads to reduce each slice in one go.
# that's why each threads also loops across their inputs, processing multiple values
# so that we can span the entire reduction dimension using a single item group.
- kernel = KI.@kernel backend launch = false partial_mapreduce_device(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
+ kernel = KI.@kernel backend launch = false partial_mapreduce_device(
+ f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A)
# how many threads do we want?
@@ -260,7 +261,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
# we can cover the dimensions to reduce using a single group
kernel(f, op, init, Val(maxthreads), Val(Rreduce), Val(Rother),
Val(UInt64(length(Rother))), Val(grain), Val(shuffle), R, A;
- workgroupsize = threads, numworkgroups = groups)
+ workgroupsize = threads, numworkgroups = groups
+ )
else
# temporary empty array whose type will match the final partial array
partial = similar(R, ntuple(_ -> 0, Val(ndims(R)+1)))
@@ -287,7 +289,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
partial_kernel(f, op, init, Val(threads), Val(Rreduce),
Val(Rother), Val(UInt64(length(Rother))),
Val(grain), Val(shuffle), partial, A;
- numworkgroups = partial_groups, workgroupsize = partial_threads)
+ numworkgroups = partial_groups, workgroupsize = partial_threads
+ )
GPUArrays.mapreducedim!(identity, op, R, partial; init=init)
end
diff --git a/test/kernelabstractions.jl b/test/kernelabstractions.jl
index 221ee680..6f9d5a2c 100644
--- a/test/kernelabstractions.jl
+++ b/test/kernelabstractions.jl
@@ -7,6 +7,6 @@ Testsuite.testsuite(()->MetalBackend(), "Metal", Metal, MtlArray, Metal.MtlDevic
"Convert", # depends on https://github.com/JuliaGPU/Metal.jl/issues/69
"SpecialFunctions", # no equivalent Metal intrinsics for gamma, erf, etc
"sparse", # not supported yet
- "CPU synchronization",
- "fallback test: callable types",
+ "CPU synchronization",
+ "fallback test: callable types",
]))
diff --git a/test/runtests.jl b/test/runtests.jl
index bcf20b51..7336f956 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,6 @@
@static if VERSION < v"1.11" && get(ENV, "BUILDKITE_PIPELINE_NAME", "Metal.jl") == "Metal.jl"
using Pkg
- Pkg.add(url="https://github.com/JuliaGPU/KernelAbstractions.jl", rev="main")
+ Pkg.add(url = "https://github.com/JuliaGPU/KernelAbstractions.jl", rev = "main")
end
 using Metal
```
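For context, here is a minimal, self-contained sketch of the launch convention this PR exercises, combining the patterns visible in the hunks above: grid-stride indexing from broadcast.jl, the `launch = false` compile-then-launch flow from mapreduce.jl, and the `numworkgroups`/`workgroupsize` keywords that MetalKernels.jl forwards as `groups`/`threads`. The `KI` import path and the `vadd!` kernel are assumptions for illustration, not code from this PR:

```julia
# Sketch only: assumes `KI` aliases KernelAbstractions' KernelIntrinsics
# module (the test setup adds KernelAbstractions#main); `vadd!` and the
# array sizes are hypothetical stand-ins.
using Metal
using KernelAbstractions
const KI = KernelAbstractions.KernelIntrinsics  # assumed alias, matching the diff's `KI`

# Grid-stride kernel in the style of the broadcast.jl hunks:
# 1-based global index, striding by the total number of work items.
function vadd!(c, a, b)
    i = KI.get_global_id().x
    stride = KI.get_global_size().x
    while 1 <= i <= length(c)
        @inbounds c[i] = a[i] + b[i]
        i += stride
    end
    return
end

a = MtlArray(ones(Float32, 1024))
b = MtlArray(ones(Float32, 1024))
c = similar(a)
backend = MetalBackend()

# Compile without launching (as mapreduce.jl does), size the workgroup
# from the kernel's pipeline limit, then launch; MetalKernels.jl maps
# workgroupsize/numworkgroups onto Metal's threads/groups.
kernel = KI.@kernel backend launch = false vadd!(c, a, b)
threads = KI.kernel_max_work_group_size(kernel; max_work_items = length(c))
groups = cld(length(c), threads)
kernel(c, a, b; workgroupsize = threads, numworkgroups = groups)
```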
Force-pushed from 9ac3d49 to 6314372
Contributor
Metal Benchmarks
| Benchmark suite | Current: 415079d | Previous: 7268b55 | Ratio |
|---|---|---|---|
| latency/precompile | 31373470583.5 ns | 25030416375 ns | 1.25 |
| latency/ttfp | 2671095208 ns | 2265089000 ns | 1.18 |
| latency/import | 1722187041.5 ns | 1442167667 ns | 1.19 |
| integration/metaldevrt | 505542 ns | 829896 ns | 0.61 |
| integration/byval/slices=1 | 1176542 ns | 1533000 ns | 0.77 |
| integration/byval/slices=3 | 7285562.5 ns | 9201292 ns | 0.79 |
| integration/byval/reference | 1174709 ns | 1523416 ns | 0.77 |
| integration/byval/slices=2 | 2103062.5 ns | 2590021 ns | 0.81 |
| kernel/indexing | 251209 ns | 582500 ns | 0.43 |
| kernel/indexing_checked | 264417 ns | 607083 ns | 0.44 |
| kernel/launch | 14625 ns | 11375 ns | 1.29 |
| kernel/rand | 287291 ns | 553583 ns | 0.52 |
| array/construct | 7125 ns | 6125 ns | 1.16 |
| array/broadcast | 274667 ns | 585917 ns | 0.47 |
| array/random/randn/Float32 | 398583 ns | 777417 ns | 0.51 |
| array/random/randn!/Float32 | 320708 ns | 618958 ns | 0.52 |
| array/random/rand!/Int64 | 321500 ns | 548292 ns | 0.59 |
| array/random/rand!/Float32 | 282041 ns | 577292 ns | 0.49 |
| array/random/rand/Int64 | 492625 ns | 783750 ns | 0.63 |
| array/random/rand/Float32 | 358875 ns | 581834 ns | 0.62 |
| array/accumulate/Int64/1d | 994583 ns | 1253875 ns | 0.79 |
| array/accumulate/Int64/dims=1 | 1184125 ns | 1796167 ns | 0.66 |
| array/accumulate/Int64/dims=2 | 1499959 ns | 2114375.5 ns | 0.71 |
| array/accumulate/Int64/dims=1L | 9720083 ns | 11663125 ns | 0.83 |
| array/accumulate/Int64/dims=2L | 7793771 ns | 9701167 ns | 0.80 |
| array/accumulate/Float32/1d | 820709 ns | 1102646 ns | 0.74 |
| array/accumulate/Float32/dims=1 | 1052583 ns | 1531229.5 ns | 0.69 |
| array/accumulate/Float32/dims=2 | 1367020.5 ns | 1843583 ns | 0.74 |
| array/accumulate/Float32/dims=1L | 8715229 ns | 9782249.5 ns | 0.89 |
| array/accumulate/Float32/dims=2L | 4693125 ns | 7210459 ns | 0.65 |
| array/reductions/reduce/Int64/1d | 708083 ns | 1510083.5 ns | 0.47 |
| array/reductions/reduce/Int64/dims=1 | 658667 ns | 1069500 ns | 0.62 |
| array/reductions/reduce/Int64/dims=2 | 903333 ns | 1115875 ns | 0.81 |
| array/reductions/reduce/Int64/dims=1L | 1246666.5 ns | 2004458 ns | 0.62 |
| array/reductions/reduce/Int64/dims=2L | 2299854.5 ns | 4227000 ns | 0.54 |
| array/reductions/reduce/Float32/1d | 523000 ns | 992042 ns | 0.53 |
| array/reductions/reduce/Float32/dims=1 | 394083 ns | 802625 ns | 0.49 |
| array/reductions/reduce/Float32/dims=2 | 444417 ns | 827750 ns | 0.54 |
| array/reductions/reduce/Float32/dims=1L | 710875 ns | 1302958 ns | 0.55 |
| array/reductions/reduce/Float32/dims=2L | 1317000 ns | 1808833.5 ns | 0.73 |
| array/reductions/mapreduce/Int64/1d | 712208 ns | 1533333.5 ns | 0.46 |
| array/reductions/mapreduce/Int64/dims=1 | 658875 ns | 1069542 ns | 0.62 |
| array/reductions/mapreduce/Int64/dims=2 | 902375 ns | 1201645.5 ns | 0.75 |
| array/reductions/mapreduce/Int64/dims=1L | 1179021 ns | 1994459 ns | 0.59 |
| array/reductions/mapreduce/Int64/dims=2L | 2297625 ns | 3609417 ns | 0.64 |
| array/reductions/mapreduce/Float32/1d | 520167 ns | 1017625 ns | 0.51 |
| array/reductions/mapreduce/Float32/dims=1 | 395000 ns | 797250 ns | 0.50 |
| array/reductions/mapreduce/Float32/dims=2 | 444959 ns | 797770.5 ns | 0.56 |
| array/reductions/mapreduce/Float32/dims=1L | 750729 ns | 1305708 ns | 0.57 |
| array/reductions/mapreduce/Float32/dims=2L | 1285542 ns | 1794125 ns | 0.72 |
| array/private/copyto!/gpu_to_gpu | 238041 ns | 629917 ns | 0.38 |
| array/private/copyto!/cpu_to_gpu | 261667 ns | 786125 ns | 0.33 |
| array/private/copyto!/gpu_to_cpu | 263542 ns | 775125 ns | 0.34 |
| array/private/iteration/findall/int | 1243084 ns | 1538333 ns | 0.81 |
| array/private/iteration/findall/bool | 1102500 ns | 1423333 ns | 0.77 |
| array/private/iteration/findfirst/int | 1193375 ns | 2029791 ns | 0.59 |
| array/private/iteration/findfirst/bool | 1192333 ns | 2010458 ns | 0.59 |
| array/private/iteration/scalar | 1787458 ns | 4660542 ns | 0.38 |
| array/private/iteration/logical | 1642687.5 ns | 2547167 ns | 0.64 |
| array/private/iteration/findmin/1d | 1423584 ns | 2461875 ns | 0.58 |
| array/private/iteration/findmin/2d | 1209375 ns | 1788250 ns | 0.68 |
| array/private/copy | 346000 ns | 597979 ns | 0.58 |
| array/shared/copyto!/gpu_to_gpu | 86292 ns | 82583 ns | 1.04 |
| array/shared/copyto!/cpu_to_gpu | 86959 ns | 82166 ns | 1.06 |
| array/shared/copyto!/gpu_to_cpu | 83541 ns | 83125 ns | 1.01 |
| array/shared/iteration/findall/int | 1241833 ns | 1559792 ns | 0.80 |
| array/shared/iteration/findall/bool | 1111375 ns | 1436625 ns | 0.77 |
| array/shared/iteration/findfirst/int | 1004833 ns | 1621791 ns | 0.62 |
| array/shared/iteration/findfirst/bool | 1003625 ns | 1614209 ns | 0.62 |
| array/shared/iteration/scalar | 209583 ns | 207042 ns | 1.01 |
| array/shared/iteration/logical | 1487250 ns | 2451874.5 ns | 0.61 |
| array/shared/iteration/findmin/1d | 1238917 ns | 2077833 ns | 0.60 |
| array/shared/iteration/findmin/2d | 1207584 ns | 1775937.5 ns | 0.68 |
| array/shared/copy | 350292 ns | 237375 ns | 1.48 |
| array/permutedims/4d | 1774124.5 ns | 2358167 ns | 0.75 |
| array/permutedims/2d | 545917 ns | 1150292 ns | 0.47 |
| array/permutedims/3d | 1146041 ns | 1643708 ns | 0.70 |
| metal/synchronization/stream | 18625 ns | 19500 ns | 0.96 |
| metal/synchronization/context | 18875 ns | 20166 ns | 0.94 |
This comment was automatically generated by a workflow using github-action-benchmark.
Force-pushed from 22e754e to 68db9c2
Force-pushed from 2b8dce1 to 0e76668
Force-pushed from db9a7dc to c802ccc
Force-pushed from 4b8f026 to ce67b4c
Force-pushed from ce67b4c to 03bb0dd
Not a draft, so that benchmarks also run.