Skip to content

Jobserver hangs due to full pipe #9739

Closed
@jorisgio

Description

@jorisgio

Problem

When running cargo build for CI on a machine with 128 cores / 256 hyperthreads, running Debian with Linux 5.8, builds regularly hang.
All rustc processes and the cargo process each have a thread trying to write "|" to what I think is the jobserver pipe, but the pipe is full, so the build hangs.
Manually increasing the pipe buffer size from C via /proc/<pid>/fd/ unblocks everything, and the build finishes.

Steps
Not sure, to be honest, beyond running a large number of jobs.

Possible Solution(s)

Notes
Currently only reproduced on cargo 1.50, but I am trying to upgrade everything to 1.54 to check whether it still occurs.

Here is one backtrace from cargo:

#0  __libc_write (nbytes=1, buf=0x7ffdf74b850f, fd=6) at ../sysdeps/unix/sysv/linux/write.c:26
#1  __libc_write (fd=6, buf=0x7ffdf74b850f, nbytes=1) at ../sysdeps/unix/sysv/linux/write.c:24
#2  0x000055add96cd5a6 in std::sys::unix::fd::FileDesc::write () at /rustc/cb75ad5db02783e8b0222fee363c5f63f7e2cf5b//library/std/src/sys/unix/fd.rs:146
#3  std::sys::unix::fs::File::write () at /rustc/cb75ad5db02783e8b0222fee363c5f63f7e2cf5b//library/std/src/sys/unix/fs.rs:845
#4  <&std::fs::File as std::io::Write>::write () at /rustc/cb75ad5db02783e8b0222fee363c5f63f7e2cf5b//library/std/src/fs.rs:679
#5  0x000055add969f5d7 in jobserver::imp::Client::release ()
#6  0x000055add96a0abd in <jobserver::Acquired as core::ops::drop::Drop>::drop ()
#7  0x000055add90134fe in cargo::core::compiler::job_queue::DrainState::drain_the_queue ()
#8  0x000055add8fcccd3 in std::panic::catch_unwind ()
#9  0x000055add8f3bafe in crossbeam_utils::thread::scope ()
#10 0x000055add9011856 in cargo::core::compiler::job_queue::JobQueue::execute ()
#11 0x000055add8ec3a15 in cargo::core::compiler::context::Context::compile ()
#12 0x000055add914bd01 in cargo::ops::cargo_compile::compile_ws ()
#13 0x000055add914ba5e in cargo::ops::cargo_compile::compile ()
#14 0x000055add8dbd67d in cargo::commands::build::exec ()
#15 0x000055add8d5f9f9 in cargo::cli::main ()
#16 0x000055add8dc7438 in cargo::main ()
#17 0x000055add8db6333 in std::sys_common::backtrace::__rust_begin_short_backtrace ()
#18 0x000055add8db6359 in std::rt::lang_start::{{closure}} ()
#19 0x000055add96df177 in core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &F>::call_once ()
    at /rustc/cb75ad5db02783e8b0222fee363c5f63f7e2cf5b/library/core/src/ops/function.rs:259
#20 std::panicking::try::do_call () at /rustc/cb75ad5db02783e8b0222fee363c5f63f7e2cf5b//library/std/src/panicking.rs:379
#21 std::panicking::try () at /rustc/cb75ad5db02783e8b0222fee363c5f63f7e2cf5b//library/std/src/panicking.rs:343
#22 std::panic::catch_unwind () at /rustc/cb75ad5db02783e8b0222fee363c5f63f7e2cf5b//library/std/src/panic.rs:396
#23 std::rt::lang_start_internal () at /rustc/cb75ad5db02783e8b0222fee363c5f63f7e2cf5b//library/std/src/rt.rs:51
#24 0x000055add8dc9c82 in main () 

And here is an example from rustc:

(gdb) info thread
  Id   Target Id                                 Frame 
* 1    Thread 0x7f64866604c0 (LWP 85722) "rustc" 0x00007f648b639495 in __GI___pthread_timedjoin_ex (threadid=140069719504640, thread_return=0x0, abstime=0x0, 
    block=<optimized out>) at pthread_join_common.c:89
  2    Thread 0x7f6485dff700 (LWP 85806) "rustc" syscall () at ../sysdeps/unix/sysv/linux/x86_64/syscall.S:38
  3    Thread 0x7f646517f700 (LWP 90268) "rustc" futex_wait_cancelable (private=0, expected=0, futex_word=0x7f647b40d30c)
    at ../sysdeps/unix/sysv/linux/futex-internal.h:88
  4    Thread 0x7f6464bff700 (LWP 90269) "rustc" __libc_write (nbytes=1, buf=0x7f6464bfc247, fd=6) at ../sysdeps/unix/sysv/linux/write.c:26
(gdb) thread 4
[Switching to thread 4 (Thread 0x7f6464bff700 (LWP 90269))]
#0  __libc_write (nbytes=1, buf=0x7f6464bfc247, fd=6) at ../sysdeps/unix/sysv/linux/write.c:26
26      ../sysdeps/unix/sysv/linux/write.c: No such file or directory.
(gdb) bt
#0  __libc_write (nbytes=1, buf=0x7f6464bfc247, fd=6) at ../sysdeps/unix/sysv/linux/write.c:26
#1  __libc_write (fd=6, buf=0x7f6464bfc247, nbytes=1) at ../sysdeps/unix/sysv/linux/write.c:24
#2  0x00007f648b6e6a26 in std::sys::unix::fd::FileDesc::write () at /rustc/cb75ad5db02783e8b0222fee363c5f63f7e2cf5b//library/std/src/sys/unix/fd.rs:146
#3  std::sys::unix::fs::File::write () at /rustc/cb75ad5db02783e8b0222fee363c5f63f7e2cf5b//library/std/src/sys/unix/fs.rs:845
#4  <&std::fs::File as std::io::Write>::write () at /rustc/cb75ad5db02783e8b0222fee363c5f63f7e2cf5b//library/std/src/fs.rs:679
#5  0x00007f648f0739f9 in <jobserver::Acquired as core::ops::drop::Drop>::drop ()
   from /home/user/.rustup/toolchains/1.50.0-x86_64-unknown-linux-gnu/bin/../lib/librustc_driver-02bb148e88292f22.so
#6  0x00007f648e4aefa9 in std::sys_common::backtrace::__rust_begin_short_backtrace ()
   from /home/user/.rustup/toolchains/1.50.0-x86_64-unknown-linux-gnu/bin/../lib/librustc_driver-02bb148e88292f22.so
#7  0x00007f648e4a9b32 in core::ops::function::FnOnce::call_once{{vtable-shim}} ()
   from /home/user/.rustup/toolchains/1.50.0-x86_64-unknown-linux-gnu/bin/../lib/librustc_driver-02bb148e88292f22.so
#8  0x00007f648b71256a in <alloc::boxed::Box<F,A> as core::ops::function::FnOnce<Args>>::call_once ()
    at /rustc/cb75ad5db02783e8b0222fee363c5f63f7e2cf5b/library/alloc/src/boxed.rs:1328
#9  <alloc::boxed::Box<F,A> as core::ops::function::FnOnce<Args>>::call_once ()
    at /rustc/cb75ad5db02783e8b0222fee363c5f63f7e2cf5b/library/alloc/src/boxed.rs:1328
#10 std::sys::unix::thread::Thread::new::thread_start () at /rustc/cb75ad5db02783e8b0222fee363c5f63f7e2cf5b//library/std/src/sys/unix/thread.rs:71
#11 0x00007f648b637fa3 in start_thread (arg=<optimized out>) at pthread_create.c:486
#12 0x00007f648b5574cf in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-jobserver (Area: jobserver, concurrency, parallelism), C-bug (Category: bug)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions