-
-
Notifications
You must be signed in to change notification settings - Fork 5.6k
Description
The default per-thread RNGs slow down multithreaded code in Julia 1.3.
Here is my environment:
julia> versioninfo(verbose=true)
Julia Version 1.3.0
Commit 46ce4d7 (2019-11-26 06:09 UTC)
Platform Info:
OS: Windows (x86_64-w64-mingw32)
Microsoft Windows [Version 10.0.18362.535]
CPU: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz:
speed user nice sys idle irq
#1 4008 MHz 633296 0 811953 12463687 195718 ticks
#2 4008 MHz 423671 0 328500 13156765 6218 ticks
#3 4008 MHz 935390 0 638296 12335250 6562 ticks
#4 4008 MHz 436546 0 558328 12913859 2953 ticks
#5 4008 MHz 772937 0 654812 12480984 5468 ticks
#6 4008 MHz 499781 0 503546 12905406 3000 ticks
#7 4008 MHz 721312 0 661609 12525812 3750 ticks
#8 4008 MHz 578609 0 338343 12991781 2156 ticks
Memory: 15.953498840332031 GB (8580.15625 MB free)
Uptime: 13908.0 sec
Load Avg: 0.0 0.0 0.0
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-6.0.1 (ORCJIT, skylake)
Environment:
JULIA_EDITOR = "C:\Users\Earth\AppData\Local\atom\app-1.42.0\atom.exe" -a
JULIA_NUM_THREADS = 8
In Julia 1.3, each thread has its own default global RNG, so calling rand() from each thread is safe. The following code, which estimates pi, therefore returns the correct result:
"""
    par_pi(n::Int)

Estimate π by Monte Carlo sampling, spreading `n` iterations over threads with
`@threads`.

Each iteration draws a point `(x, y)` with two calls to `rand()` (the default
RNG) and counts it as a hit when it lies inside the unit circle. Hits are
accumulated in one slot per thread, indexed by `threadid()`, and summed at the
end. The result is `4.0 * hits / n`; for `n == 0` this is `NaN`.
"""
function par_pi(n::Int)
    hits = zeros(Int, nthreads())
    @threads for i in 1:n
        x = rand()
        y = rand()
        # Adding a Bool increments the slot by 1 only when the point is a hit.
        hits[threadid()] += (x * x + y * y <= 1)
    end
    return 4.0 * sum(hits) / n
end
@btime par_pi(10_000_000) # reported with JULIA_NUM_THREADS = 8: 102.228 ms (59 allocations: 7.00 KiB)
But the above code runs 4x slower than the following version that uses manually allocated RNGs:
# One manually allocated MersenneTwister per thread, each constructed without
# an explicit seed; indexed by `threadid()` in `par_pi2`.
const threadsRNG = [MersenneTwister() for _ in 1:nthreads()]

"""
    par_pi2(n::Int)

Estimate π by Monte Carlo sampling, like `par_pi`, but draw the random numbers
from the manually allocated generator `threadsRNG[threadid()]` instead of the
default RNG.

Hits inside the unit circle are accumulated in one slot per thread and summed
at the end. The result is `4.0 * hits / n`; for `n == 0` this is `NaN`.
"""
function par_pi2(n::Int)
    hits = zeros(Int, nthreads())
    @threads for i in 1:n
        rng = threadsRNG[threadid()]
        x = rand(rng)
        y = rand(rng)
        # Adding a Bool increments the slot by 1 only when the point is a hit.
        hits[threadid()] += (x * x + y * y <= 1)
    end
    return 4.0 * sum(hits) / n
end
# Measured with @btime, like par_pi above, so the two timings are comparable;
# the quoted result is in @btime's output format.
@btime par_pi2(10_000_000) # reported with JULIA_NUM_THREADS = 8: 25.503 ms (59 allocations: 7.00 KiB)
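What's strange is that the lowered versions are identical to each other except for one generated variable name (note that `@code_lowered` shows only the outer function here; the loop body, including the `rand` calls, lives in the generated `threadsfor_fun` closure and does not appear in this output):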
julia> @code_lowered par_pi(10^7)
CodeInfo(
1 ─ %1 = Main.nthreads()
│ hits = Main.zeros(Main.Int, %1)
│ range = 1:n
│ %4 = Main.:(var"#390#threadsfor_fun#31")
│ %5 = Core.typeof(hits)
│ %6 = Core.typeof(range)
│ %7 = Core.apply_type(%4, %5, %6)
│ %8 = hits
│ threadsfor_fun = %new(%7, %8, range)
│ %10 = Base.Threads.threadid()
│ %11 = %10 != 1
└── goto #3 if not %11
2 ─ %13 = Base.invokelatest
│ %14 = threadsfor_fun
│ (%13)(%14, true)
└── goto #4
3 ─ %17 = Base.cconvert(Base.Threads.Any, threadsfor_fun)
│ %18 = Base.unsafe_convert(Base.Threads.Any, %17)
└── $(Expr(:foreigncall, :(:jl_threading_run), Nothing, svec(Any), 0, :(:ccall), :(%18), :(%17)))
4 ┄ Base.Threads.nothing
│ %21 = Main.sum(hits)
│ %22 = 4.0 * %21
│ %23 = %22 / n
└── return %23
)
and
julia> @code_lowered par_pi2(10^7)
CodeInfo(
1 ─ %1 = Main.nthreads()
│ hits = Main.zeros(Main.Int, %1)
│ range = 1:n
│ %4 = Main.:(var"#372#threadsfor_fun#30")
│ %5 = Core.typeof(hits)
│ %6 = Core.typeof(range)
│ %7 = Core.apply_type(%4, %5, %6)
│ %8 = hits
│ threadsfor_fun = %new(%7, %8, range)
│ %10 = Base.Threads.threadid()
│ %11 = %10 != 1
└── goto #3 if not %11
2 ─ %13 = Base.invokelatest
│ %14 = threadsfor_fun
│ (%13)(%14, true)
└── goto #4
3 ─ %17 = Base.cconvert(Base.Threads.Any, threadsfor_fun)
│ %18 = Base.unsafe_convert(Base.Threads.Any, %17)
└── $(Expr(:foreigncall, :(:jl_threading_run), Nothing, svec(Any), 0, :(:ccall), :(%18), :(%17)))
4 ┄ Base.Threads.nothing
│ %21 = Main.sum(hits)
│ %22 = 4.0 * %21
│ %23 = %22 / n
└── return %23
)