Closed
Description
I have some example code which I use to demonstrate how a simple multithreaded workload scales. I found that it inexplicably scales much worse than expected in Julia 1.11 and today's master compared to Julia 1.10.
Example code (click to expand)
"""
    copy_odds_branches!(dst, src)

Copy every odd element of `src` into the leading slots of `dst`, preserving their
order, and return the number of elements written.

The odd/even test is an explicit branch in the loop body. The whole loop runs under
`@inbounds`, so the writes to `dst` are not bounds-checked: `dst` must be able to hold
one element for every odd value in `src`, starting at index 1.
"""
function copy_odds_branches!(dst, src)
    n_written = 0
    @inbounds for idx in eachindex(src)
        elem = src[idx]
        # Even values are skipped; only odd ones advance the write cursor.
        isodd(elem) || continue
        n_written += 1
        dst[n_written] = elem
    end
    return n_written
end
"""
    frequent_cache_misses()

Generate a vector of 10_000_000 random `UInt8` values, allocate a destination buffer
of the same size with `similar`, and run `copy_odds_branches!` over them 40 times.

Returns the sum of the counts reported by the 40 passes.
"""
function frequent_cache_misses()
    input = rand(UInt8, 10_000_000)
    output = similar(input)
    total = 0
    for _ in 1:40
        total += copy_odds_branches!(output, input)
    end
    return total
end
"""
    parallel_run(n_jobs)

Spawn `n_jobs` tasks with `Threads.@spawn`, each running `frequent_cache_misses()`,
fetch all of them, and return the sum of their `Int` results.

Returns `0` when `n_jobs` is zero or negative, in which case no task is spawned.
"""
function parallel_run(n_jobs)
    jobs = Task[]
    for _ in 1:n_jobs
        push!(jobs, Threads.@spawn frequent_cache_misses())
    end
    # `init = 0` gives the reduction a neutral element, so an empty task list sums
    # to 0 instead of throwing on an empty collection.
    return sum(job -> fetch(job)::Int, jobs; init = 0)
end
# Benchmark driver: time `parallel_run` for each job count in turn.
let job_counts = (1, 4, 8)
    # Untimed warm-up call, made before the timed runs below.
    parallel_run(1)
    for n in job_counts
        @time parallel_run(n)
    end
end
This should be run in a Julia session with 8 threads available.
Here are the timings for different versions of Julia:
1.10:
0.972577 seconds (11 allocations: 19.074 MiB)
1.063923 seconds (38 allocations: 76.297 MiB, 0.56% gc time)
1.249448 seconds (74 allocations: 152.594 MiB)
1.11:
1.023349 seconds (12 allocations: 19.074 MiB)
2.079489 seconds (42 allocations: 76.296 MiB, 1.40% gc time)
4.187955 seconds (82 allocations: 152.593 MiB, 0.67% gc time)
1.12 (a few-days-old master):
0.982807 seconds (12 allocations: 19.076 MiB, 1.53% gc time)
1.999636 seconds (42 allocations: 76.296 MiB, 1.06% gc time)
4.007706 seconds (82 allocations: 152.591 MiB, 1.55% gc time)