Open
Description
I am running the Dagger benchmarks on an AMD Zen system with 64 cores. The "raw" benchmarks are working fine, but the "dagger" benchmarks abort with an error (see below).
I am using Julia 1.9:
julia> versioninfo()
Julia Version 1.9.3
Commit bed2cd540a1 (2023-08-24 14:43 UTC)
Build Info:
Official https://julialang.org/ release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 32 × AMD EPYC 7302 16-Core Processor
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-14.0.6 (ORCJIT, znver2)
Threads: 1 on 32 virtual cores
Environment:
LD_LIBRARY_PATH = /cm/shared/apps/slurm/current/lib64/slurm:/cm/shared/apps/slurm/current/lib64
and a recent version of Dagger (v0.18.3).
The error message is:
$ env BENCHMARK=nmf:dagger BENCHMARK_PROCS=8:8 julia +1.9 ~/.julia/packages/Dagger/ZOt9H/benchmarks/benchmark.jl
creating benchmarks for suite nmf, exec dagger, accels String[]
running benchmarks for suite nmf, exec dagger, accels String[]
[ Info: Starting 1 worker Dagger NNMF (scale by 16)
┌ Error: Error running benchmarks for suite nmf, exec dagger, accels String[]
│ exception =
│ ThunkFailedException:
│ Root Exception Type: CapturedException
│ Root Exception:
│ ConcurrencyViolationError("lock must be held")
│ Stacktrace:
│ [1] #wait#621
│ @ ./condition.jl:127
│ [2] wait
│ @ ./condition.jl:125 [inlined]
│ [3] wait_for_conn
│ @ ~/.julia/juliaup/julia-1.9.3+0.x64.linux.gnu/share/julia/stdlib/v1.9/Distributed/src/cluster.jl:195
│ [4] check_worker_state
│ @ ~/.julia/juliaup/julia-1.9.3+0.x64.linux.gnu/share/julia/stdlib/v1.9/Distributed/src/cluster.jl:170
│ [5] send_msg_
│ @ ~/.julia/juliaup/julia-1.9.3+0.x64.linux.gnu/share/julia/stdlib/v1.9/Distributed/src/messages.jl:172
│ [6] send_msg
│ @ ~/.julia/juliaup/julia-1.9.3+0.x64.linux.gnu/share/julia/stdlib/v1.9/Distributed/src/messages.jl:122 [inlined]
│ [7] #remotecall_fetch#159
│ @ ~/.julia/juliaup/julia-1.9.3+0.x64.linux.gnu/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:460
│ [8] remotecall_fetch
│ @ ~/.julia/juliaup/julia-1.9.3+0.x64.linux.gnu/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:454
│ [9] #remotecall_fetch#162
│ @ ~/.julia/juliaup/julia-1.9.3+0.x64.linux.gnu/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:492 [inlined]
│ [10] remotecall_fetch
│ @ ~/.julia/juliaup/julia-1.9.3+0.x64.linux.gnu/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:492 [inlined]
│ [11] #171
│ @ ~/.julia/packages/MemPool/l9nLj/src/datastore.jl:424 [inlined]
│ [12] forwardkeyerror
│ @ ~/.julia/packages/MemPool/l9nLj/src/datastore.jl:409
│ [13] poolget
│ @ ~/.julia/packages/MemPool/l9nLj/src/datastore.jl:423
│ [14] move
│ @ ~/.julia/packages/Dagger/ZOt9H/src/chunks.jl:98
│ [15] move
│ @ ~/.julia/packages/Dagger/ZOt9H/src/chunks.jl:96
│ [16] #invokelatest#2
│ @ ./essentials.jl:819 [inlined]
│ [17] invokelatest
│ @ ./essentials.jl:816 [inlined]
│ [18] macro expansion
│ @ ~/.julia/packages/Dagger/ZOt9H/src/sch/Sch.jl:1475 [inlined]
│ [19] #152
│ @ ./task.jl:514
│ Stacktrace:
│ [1] wait
│ @ ./task.jl:349 [inlined]
│ [2] fetch
│ @ ./task.jl:369 [inlined]
│ [3] fetch_report
│ @ ~/.julia/packages/Dagger/ZOt9H/src/sch/util.jl:229
│ [4] do_task
│ @ ~/.julia/packages/Dagger/ZOt9H/src/sch/Sch.jl:1502
│ [5] macro expansion
│ @ ~/.julia/packages/Dagger/ZOt9H/src/sch/Sch.jl:1243 [inlined]
│ [6] #130
│ @ ./task.jl:134
│ Root Thunk: Thunk(id=410, Dagger.Chunk{typeof(*), MemPool.DRef, OSProc, ProcessScope}(typeof(*), UnitDomain(), MemPool.DRef(1, 962, 0x0000000000000000), OSProc(1), ProcessScope: worker == 2, false)(Thunk[199](alloc, Any[3, (2, 12)]), Thunk[212](alloc, Any[13, (12, 500)])))
│ Inner Thunk: Thunk(id=2419, Dagger.Chunk{Dagger.var"#219#220"{Dagger.DArray{Float32, 2, Blocks{2}, typeof(cat)}, Dagger.DomainBlocks{2}, ArrayDomain{2}, Tuple{Int64, Int64}}, MemPool.DRef, OSProc, ProcessScope}(Dagger.var"#219#220"{Dagger.DArray{Float32, 2, Blocks{2}, typeof(cat)}, Dagger.DomainBlocks{2}, ArrayDomain{2}, Tuple{Int64, Int64}}, UnitDomain(), MemPool.DRef(1, 6989, 0x000000000000b420), OSProc(1), ProcessScope: worker == 2, false)(195 inputs...))
│ This Thunk: Thunk(id=2419, Dagger.Chunk{Dagger.var"#219#220"{Dagger.DArray{Float32, 2, Blocks{2}, typeof(cat)}, Dagger.DomainBlocks{2}, ArrayDomain{2}, Tuple{Int64, Int64}}, MemPool.DRef, OSProc, ProcessScope}(Dagger.var"#219#220"{Dagger.DArray{Float32, 2, Blocks{2}, typeof(cat)}, Dagger.DomainBlocks{2}, ArrayDomain{2}, Tuple{Int64, Int64}}, UnitDomain(), MemPool.DRef(1, 6989, 0x000000000000b420), OSProc(1), ProcessScope: worker == 2, false)(195 inputs...))
│ Stacktrace:
│ [1] fetch(t::Dagger.ThunkFuture; proc::OSProc, raw::Bool)
│ @ Dagger ~/.julia/packages/Dagger/ZOt9H/src/eager_thunk.jl:16
│ [2] fetch
│ @ ~/.julia/packages/Dagger/ZOt9H/src/eager_thunk.jl:11 [inlined]
│ [3] #fetch#75
│ @ ~/.julia/packages/Dagger/ZOt9H/src/eager_thunk.jl:58 [inlined]
│ [4] fetch
│ @ ~/.julia/packages/Dagger/ZOt9H/src/eager_thunk.jl:54 [inlined]
│ [5] fetch(c::Dagger.DArray{Float32, 2, Blocks{2}, typeof(cat)})
│ @ Dagger ~/.julia/packages/Dagger/ZOt9H/src/array/darray.jl:281
│ [6] (::var"#98#101"{Base.RefValue{Any}, Base.RefValue{Any}, Base.RefValue{Any}})()
│ @ Main ~/.julia/packages/Dagger/ZOt9H/benchmarks/suites/nmf.jl:77
│ [7] #21
│ @ ~/.julia/packages/Dagger/ZOt9H/src/options.jl:17 [inlined]
│ [8] (::ScopedValues.var"#1#2"{Dagger.var"#21#22"{var"#98#101"{Base.RefValue{Any}, Base.RefValue{Any}, Base.RefValue{Any}}}})()
│ @ ScopedValues ~/.julia/packages/ScopedValues/92HJZ/src/ScopedValues.jl:163
│ [9] with_logstate(f::Function, logstate::Any)
│ @ Base.CoreLogging ./logging.jl:514
│ [10] with_logger
│ @ ./logging.jl:626 [inlined]
│ [11] enter_scope
│ @ ~/.julia/packages/ScopedValues/92HJZ/src/payloadlogger.jl:17 [inlined]
│ [12] with(::Any, ::Pair{<:ScopedValues.ScopedValue})
│ @ ScopedValues ~/.julia/packages/ScopedValues/92HJZ/src/ScopedValues.jl:162
│ [13] scoped(::Function, ::Vararg{Any}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
│ @ ScopedValues ./deprecated.jl:116
│ [14] scoped(::Function, ::Vararg{Any})
│ @ ScopedValues ./deprecated.jl:113
│ [15] with_options
│ @ ~/.julia/packages/Dagger/ZOt9H/src/options.jl:16 [inlined]
│ [16] #with_options#23
│ @ ~/.julia/packages/Dagger/ZOt9H/src/options.jl:20 [inlined]
│ [17] with_options
│ @ ~/.julia/packages/Dagger/ZOt9H/src/options.jl:20 [inlined]
│ [18] macro expansion
│ @ ~/.julia/packages/Dagger/ZOt9H/benchmarks/suites/nmf.jl:76 [inlined]
│ [19] var"##core#419"(scope#295::ProcessScope, X#296::Base.RefValue{Any}, W#297::Base.RefValue{Any}, H#298::Base.RefValue{Any}, _nw::Int64, _scale::Int64)
│ @ Main ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:489
│ [20] var"##sample#420"(::Tuple{ProcessScope, Base.RefValue{Any}, Base.RefValue{Any}, Base.RefValue{Any}}, __params::BenchmarkTools.Parameters)
│ @ Main ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:495
│ [21] _run(b::BenchmarkTools.Benchmark, p::BenchmarkTools.Parameters; verbose::Bool, pad::String, kwargs::Base.Pairs{Symbol, Integer, Tuple{Symbol, Symbol, Symbol}, NamedTuple{(:samples, :seconds, :gcsample), Tuple{Int64, Int64, Bool}}})
│ @ BenchmarkTools ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:99
│ [22] #invokelatest#2
│ @ ./essentials.jl:821 [inlined]
│ [23] invokelatest
│ @ ./essentials.jl:816 [inlined]
│ [24] #run_result#45
│ @ ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:34 [inlined]
│ [25] run_result
│ @ ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:34 [inlined]
│ [26] run(b::BenchmarkTools.Benchmark, p::BenchmarkTools.Parameters; progressid::Base.UUID, nleaves::Int64, ndone::Int64, kwargs::Base.Pairs{Symbol, Any, NTuple{5, Symbol}, NamedTuple{(:verbose, :pad, :samples, :seconds, :gcsample), Tuple{Bool, String, Int64, Int64, Bool}}})
│ @ BenchmarkTools ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:117
│ [27] run (repeats 2 times)
│ @ ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:117 [inlined]
│ [28] macro expansion
│ @ ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:135 [inlined]
│ [29] macro expansion
│ @ ./timing.jl:393 [inlined]
│ [30] (::BenchmarkTools.var"#51#52"{Bool, String, Base.Pairs{Symbol, Any, NTuple{6, Symbol}, NamedTuple{(:samples, :seconds, :gcsample, :progressid, :nleaves, :ndone), Tuple{Int64, Int64, Bool, Base.UUID, Int64, Int64}}}, BenchmarkGroup, Tuple{}})(progressid::Base.UUID, nleaves::Int64, ndone::Int64)
│ @ BenchmarkTools ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:134
│ [31] _withprogress(f::BenchmarkTools.var"#51#52"{Bool, String, Base.Pairs{Symbol, Any, NTuple{6, Symbol}, NamedTuple{(:samples, :seconds, :gcsample, :progressid, :nleaves, :ndone), Tuple{Int64, Int64, Bool, Base.UUID, Int64, Int64}}}, BenchmarkGroup, Tuple{}}, name::String, group::BenchmarkGroup; progressid::Base.UUID, nleaves::Int64, ndone::Int64, #unused#::Base.Pairs{Symbol, Integer, Tuple{Symbol, Symbol, Symbol}, NamedTuple{(:samples, :seconds, :gcsample), Tuple{Int64, Int64, Bool}}})
│ @ BenchmarkTools ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:73
│ [32] run(::BenchmarkGroup; verbose::Bool, pad::String, kwargs::Base.Pairs{Symbol, Any, NTuple{6, Symbol}, NamedTuple{(:samples, :seconds, :gcsample, :progressid, :nleaves, :ndone), Tuple{Int64, Int64, Bool, Base.UUID, Int64, Int64}}})
│ @ BenchmarkTools ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:125
│ [33] macro expansion
│ @ ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:135 [inlined]
│ [34] macro expansion
│ @ ./timing.jl:393 [inlined]
│ [35] (::BenchmarkTools.var"#51#52"{Bool, String, Base.Pairs{Symbol, Integer, Tuple{Symbol, Symbol, Symbol}, NamedTuple{(:samples, :seconds, :gcsample), Tuple{Int64, Int64, Bool}}}, BenchmarkGroup, Tuple{}})(progressid::Base.UUID, nleaves::Int64, ndone::Int64)
│ @ BenchmarkTools ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:134
│ [36] _withprogress(f::BenchmarkTools.var"#51#52"{Bool, String, Base.Pairs{Symbol, Integer, Tuple{Symbol, Symbol, Symbol}, NamedTuple{(:samples, :seconds, :gcsample), Tuple{Int64, Int64, Bool}}}, BenchmarkGroup, Tuple{}}, name::String, group::BenchmarkGroup; progressid::Nothing, nleaves::Float64, ndone::Float64, #unused#::Base.Pairs{Symbol, Integer, Tuple{Symbol, Symbol, Symbol}, NamedTuple{(:samples, :seconds, :gcsample), Tuple{Int64, Int64, Bool}}})
│ @ BenchmarkTools ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:79
│ [37] _withprogress
│ @ ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:63 [inlined]
│ [38] #run#50
│ @ ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:125 [inlined]
│ [39] run
│ @ ~/.julia/packages/BenchmarkTools/0owsb/src/execution.jl:125 [inlined]
│ [40] main()
│ @ Main ~/.julia/packages/Dagger/ZOt9H/benchmarks/benchmark.jl:223
│ [41] top-level scope
│ @ ~/.julia/packages/Dagger/ZOt9H/benchmarks/benchmark.jl:262
│ [42] include(mod::Module, _path::String)
│ @ Base ./Base.jl:457
│ [43] exec_options(opts::Base.JLOptions)
│ @ Base ./client.jl:307
│ [44] _start()
│ @ Base ./client.jl:522
└ @ Main ~/.julia/packages/Dagger/ZOt9H/benchmarks/benchmark.jl:225
Done!
┌ Warning: Worker 2 died, rescheduling work
└ @ Dagger.Sch ~/.julia/packages/Dagger/ZOt9H/src/sch/Sch.jl:529
┌ Warning: Worker 2 died, rescheduling work
└ @ Dagger.Sch ~/.julia/packages/Dagger/ZOt9H/src/sch/Sch.jl:529
┌ Error: Fatal error on process 1
│ exception =
│ attempt to send to unknown socket
│ Stacktrace:
│ [1] error(s::String)
│ @ Base ./error.jl:35
│ [2] send_msg_unknown(s::Sockets.TCPSocket, header::Distributed.MsgHeader, msg::Distributed.ResultMsg)
│ @ Distributed ~/.julia/juliaup/julia-1.9.3+0.x64.linux.gnu/share/julia/stdlib/v1.9/Distributed/src/messages.jl:99
│ [3] send_msg_now(s::Sockets.TCPSocket, header::Distributed.MsgHeader, msg::Distributed.ResultMsg)
│ @ Distributed ~/.julia/juliaup/julia-1.9.3+0.x64.linux.gnu/share/julia/stdlib/v1.9/Distributed/src/messages.jl:115
│ [4] deliver_result(sock::Sockets.TCPSocket, msg::Symbol, oid::Distributed.RRID, value::RemoteException)
│ @ Distributed ~/.julia/juliaup/julia-1.9.3+0.x64.linux.gnu/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:102
│ [5] macro expansion
│ @ ~/.julia/juliaup/julia-1.9.3+0.x64.linux.gnu/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:293 [inlined]
│ [6] (::Distributed.var"#109#111"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})()
│ @ Distributed ./task.jl:514
└ @ Distributed ~/.julia/juliaup/julia-1.9.3+0.x64.linux.gnu/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:106
┌ Warning: Worker 2 died, rescheduling work
└ @ Dagger.Sch ~/.julia/packages/Dagger/ZOt9H/src/sch/Sch.jl:529
┌ Warning: Worker 5 died, rescheduling work
└ @ Dagger.Sch ~/.julia/packages/Dagger/ZOt9H/src/sch/Sch.jl:529
┌ Error: Fatal error on process 1
│ exception =
│ attempt to send to unknown socket
│ Stacktrace:
│ [1] error(s::String)
│ @ Base ./error.jl:35
│ [2] send_msg_unknown(s::Sockets.TCPSocket, header::Distributed.MsgHeader, msg::Distributed.ResultMsg)
│ @ Distributed ~/.julia/juliaup/julia-1.9.3+0.x64.linux.gnu/share/julia/stdlib/v1.9/Distributed/src/messages.jl:99
│ [3] send_msg_now(s::Sockets.TCPSocket, header::Distributed.MsgHeader, msg::Distributed.ResultMsg)
│ @ Distributed ~/.julia/juliaup/julia-1.9.3+0.x64.linux.gnu/share/julia/stdlib/v1.9/Distributed/src/messages.jl:115
│ [4] deliver_result(sock::Sockets.TCPSocket, msg::Symbol, oid::Distributed.RRID, value::RemoteException)
│ @ Distributed ~/.julia/juliaup/julia-1.9.3+0.x64.linux.gnu/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:102
│ [5] macro expansion
│ @ ~/.julia/juliaup/julia-1.9.3+0.x64.linux.gnu/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:293 [inlined]
│ [6] (::Distributed.var"#109#111"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})()
│ @ Distributed ./task.jl:514
└ @ Distributed ~/.julia/juliaup/julia-1.9.3+0.x64.linux.gnu/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:106
┌ Warning: Worker 3 died, rescheduling work
└ @ Dagger.Sch ~/.julia/packages/Dagger/ZOt9H/src/sch/Sch.jl:529
┌ Warning: Worker 6 died, rescheduling work
└ @ Dagger.Sch ~/.julia/packages/Dagger/ZOt9H/src/sch/Sch.jl:529
┌ Warning: Worker 7 died, rescheduling work
└ @ Dagger.Sch ~/.julia/packages/Dagger/ZOt9H/src/sch/Sch.jl:529
┌ Warning: Worker 8 died, rescheduling work
└ @ Dagger.Sch ~/.julia/packages/Dagger/ZOt9H/src/sch/Sch.jl:529
┌ Warning: Worker 4 died, rescheduling work
└ @ Dagger.Sch ~/.julia/packages/Dagger/ZOt9H/src/sch/Sch.jl:529