Skip to content

very bad codegen for thread_local! on OSX #60141

Closed
@mtak-

Description

@mtak-

This sample code demonstrates the problem.
Godbolt link (macOS output on the left/top).

// Minimal reproducer: a thread-local `String` whose initializer is an opaque
// foreign function, plus a single public accessor that clones the value.
// The assembly listing that follows in this report is the output for `get`.

// NOTE(review): as written, this attribute attaches to the `type T` alias
// below, not to the `extern "C"` block that declares `init` — confirm whether
// a crate-level `#![allow(improper_ctypes)]` was intended.
#[allow(improper_ctypes)]

// Payload type stored in the thread-local and returned by `get`.
type T = String;

// `X` is initialized by calling the foreign `init` on first access.
thread_local!{
    static X: T = unsafe { init() };
}

extern "C" {
    // Declaration only; no body is visible here, so the initializer is opaque
    // to the optimizer. Presumably the `inline(never)`/`cold` hints are meant
    // to keep initialization off the hot path in the listing — TODO confirm
    // they have any effect on a foreign declaration.
    #[inline(never)]
    #[cold]
    fn init() -> T;
}

/// Returns a clone of the current thread's `X`.
pub fn get() -> T {
    X.with(|x| x.clone())
}
asm
# example::get as currently generated on macOS.
# Every `movq ...__KEY@TLVP(%rip), %rdi` + `callq *(%rdi)` pair is one
# thread-local address lookup. On the path where the value is already
# initialized and the destructor already registered, four such pairs execute
# (marked [lookup 1]..[lookup 4]).
example::get:
  pushq %rbp
  movq %rsp, %rbp
  pushq %r14
  pushq %rbx
  subq $64, %rsp
  # Keep the incoming %rdi; the cloned String is stored through it before returning.
  movq %rdi, %r14
  # [lookup 1]
  movq example::X::__getit::__KEY@TLVP(%rip), %rdi
  callq *(%rdi)
  # Non-zero byte at offset 25 -> unwrap_failed (presumably the "destructor running" flag).
  cmpb $0, 25(%rax)
  jne LBB3_9
  # [lookup 2]
  movq example::X::__getit::__KEY@TLVP(%rip), %rdi
  callq *(%rdi)
  # Non-zero byte at offset 24 -> skip destructor registration (it is set to 1 right after register_dtor).
  cmpb $0, 24(%rax)
  jne LBB3_3
  # Extra lookup taken only on the registration path.
  movq example::X::__getit::__KEY@TLVP(%rip), %rdi
  callq *(%rdi)
  movq %rax, %rbx
  leaq std::thread::local::fast::destroy_value(%rip), %rsi
  movq %rax, %rdi
  callq std::sys::unix::fast_thread_local::register_dtor
  movb $1, 24(%rbx)
LBB3_3:
  # [lookup 3]
  movq example::X::__getit::__KEY@TLVP(%rip), %rdi
  callq *(%rdi)
  # First word zero -> value not yet initialized, go run init.
  cmpq $0, (%rax)
  je LBB3_4
LBB3_7:
  # [lookup 4] — address passed as the source argument to String::clone.
  movq example::X::__getit::__KEY@TLVP(%rip), %rdi
  callq *(%rdi)
  leaq -40(%rbp), %rdi
  movq %rax, %rsi
  callq <alloc::string::String as core::clone::Clone>::clone
  movq -40(%rbp), %rax
  vmovups -32(%rbp), %xmm0
  vmovaps %xmm0, -80(%rbp)
  # A zero first word in the clone result also goes to unwrap_failed.
  testq %rax, %rax
  je LBB3_9
  # Copy the three words of the clone into the caller-provided slot and return it.
  movq %rax, (%r14)
  vmovaps -80(%rbp), %xmm0
  vmovups %xmm0, 8(%r14)
  movq %r14, %rax
  addq $64, %rsp
  popq %rbx
  popq %r14
  popq %rbp
  retq
LBB3_4:
  # Initialization path: call the foreign init into a stack temporary.
  leaq -40(%rbp), %rdi
  callq _init
  vmovups -40(%rbp), %xmm0
  vmovaps %xmm0, -64(%rbp)
  movq -24(%rbp), %rcx
  # Another lookup after init returns.
  movq example::X::__getit::__KEY@TLVP(%rip), %rdi
  callq *(%rdi)
  # Save the old first two words, then overwrite the slot with the new value.
  movq (%rax), %rdi
  movq 8(%rax), %rsi
  vmovaps -64(%rbp), %xmm0
  vmovups %xmm0, (%rax)
  movq %rcx, 16(%rax)
  # The old contents are deallocated only when both saved words are non-zero.
  testq %rdi, %rdi
  je LBB3_7
  testq %rsi, %rsi
  je LBB3_7
  movl $1, %edx
  callq ___rust_dealloc
  jmp LBB3_7
LBB3_9:
  callq core::result::unwrap_failed

The asm shows that even when the value has already been initialized and its destructor has been registered (but is not yet running), the thread-local pointer is looked up (callq *(%rdi)) four times. On Linux, that lookup occurs only once.

One potential fix is to insert a read_volatile inside of __getit().

// Take a reference to the thread-local key.
let key = &__KEY;
// make platform specific version of this
// Volatile read of the reference itself. In the improved assembly listing the
// looked-up address is written to the stack and reloaded once, and every later
// access goes through that reloaded register instead of a fresh lookup.
// NOTE(review): `read_volatile` is an unsafe fn — presumably this runs inside
// an `unsafe` context in `__getit`; confirm.
let key = $crate::ptr::read_volatile(&key);
key.get()

Which improves the asm to the following:

asm
# `get` after inserting the read_volatile: the thread-local address is looked
# up exactly once, and all later accesses reuse %rbx.
__ZN17thread_local_test17thread_local_test3get17h69984f7e963f5a1bE:
	pushq	%rbp
	movq	%rsp, %rbp
	pushq	%r14
	pushq	%rbx
	subq	$48, %rsp
	movq	%rdi, %r14
	# The only TLV lookup in this function.
	movq	__ZN17thread_local_test17thread_local_test3get1X7__getit5__KEY17h1886c2e600469f01E@TLVP(%rip), %rdi
	callq	*(%rdi)
	# Store the address to the stack and load it straight back (the volatile read); %rbx holds it from here on.
	movq	%rax, -48(%rbp)
	movq	-48(%rbp), %rbx
	# Non-zero byte at offset 25 -> unwrap_failed.
	cmpb	$0, 25(%rbx)
	jne	LBB10_9
	# Non-zero byte at offset 24 -> skip destructor registration.
	cmpb	$0, 24(%rbx)
	jne	LBB10_3
	leaq	__ZN3std6thread5local4fast13destroy_value17hbc43def25f86e32eE(%rip), %rsi
	movq	%rbx, %rdi
	callq	__ZN3std3sys4unix17fast_thread_local13register_dtor17ha35ff2a0753ab802E
	movb	$1, 24(%rbx)
LBB10_3:
	# First word zero -> value not yet initialized, go run init.
	cmpq	$0, (%rbx)
	je	LBB10_4
LBB10_7:
	# Clone straight from the cached address in %rbx.
	leaq	-48(%rbp), %rdi
	movq	%rbx, %rsi
	callq	__ZN60_$LT$alloc..string..String$u20$as$u20$core..clone..Clone$GT$5clone17h9234dcb674122143E
	movq	-48(%rbp), %rax
	vmovups	-40(%rbp), %xmm0
	vmovaps	%xmm0, -64(%rbp)
	testq	%rax, %rax
	je	LBB10_9
	# Copy the three words of the clone into the caller-provided slot.
	movq	%rax, (%r14)
	vmovaps	-64(%rbp), %xmm0
	vmovups	%xmm0, 8(%r14)
	addq	$48, %rsp
	popq	%rbx
	popq	%r14
	popq	%rbp
	retq
LBB10_4:
	# Initialization path: call init into a stack temporary, then store through %rbx without another lookup.
	leaq	-48(%rbp), %rdi
	callq	__ZN17thread_local_test17thread_local_test4init17h2e4a5dfd2802b210E
	vmovaps	-48(%rbp), %xmm0
	movq	-32(%rbp), %rax
	# Save the old first two words before overwriting the slot.
	movq	(%rbx), %rdi
	movq	8(%rbx), %rsi
	vmovups	%xmm0, (%rbx)
	movq	%rax, 16(%rbx)
	# The old contents are deallocated only when both saved words are non-zero.
	testq	%rdi, %rdi
	je	LBB10_7
	testq	%rsi, %rsi
	je	LBB10_7
	movl	$1, %edx
	callq	___rust_dealloc
	jmp	LBB10_7
LBB10_9:
	callq	__ZN4core6result13unwrap_failed17h6ad7be40c736aa06E

Benchmark

    /// Initializer for the benchmarked thread-local; marked never-inline and
    /// cold so it stays out of the measured access path.
    #[inline(never)]
    #[cold]
    fn init() -> String {
        String::from("hello world")
    }

    /// Benchmarks repeated access to an already-initialized thread-local.
    ///
    /// Each `b.iter` sample performs `ITER_COUNT` calls to `X.with`, so every
    /// reported ns/iter figure covers one million accesses.
    #[bench]
    fn thread_local(b: &mut Bencher) {
        const ITER_COUNT: usize = 1_000_000;
        thread_local! {
            static X: String = init();
        }
        // Access once before timing so the first-access work is not measured.
        X.with(|_| {});
        b.iter(|| {
            for _ in 0..ITER_COUNT {
                X.with(|x| {
                    // Presumably keeps the optimizer from discarding the access.
                    test::black_box(x);
                })
            }
        })
    }

Results

current       ... bench:   7,128,388 ns/iter
read_volatile ... bench:   2,085,139 ns/iter (+/- 204,564)

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-thread-locals — Area: Thread local storage (TLS)
    I-slow — Issue: Problems and improvements with respect to performance of generated code.
    O-macos — Operating system: macOS

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions