Description
This sample code demonstrates the problem; `init` is declared as an opaque `extern` function so the compiler cannot fold away the lazy-initialization path in `get()`.
godbolt link (macOS on the left/top).
```rust
type T = String;

thread_local! {
    static X: T = unsafe { init() };
}

// `init` is an opaque extern function, so the compiler cannot inline or
// const-fold the initializer away.
#[allow(improper_ctypes)]
extern "C" {
    #[inline(never)]
    #[cold]
    fn init() -> T;
}

pub fn get() -> T {
    X.with(|x| x.clone())
}
```
```asm
example::get:
pushq %rbp
movq %rsp, %rbp
pushq %r14
pushq %rbx
subq $64, %rsp
movq %rdi, %r14
movq example::X::__getit::__KEY@TLVP(%rip), %rdi
callq *(%rdi)
cmpb $0, 25(%rax)
jne LBB3_9
movq example::X::__getit::__KEY@TLVP(%rip), %rdi
callq *(%rdi)
cmpb $0, 24(%rax)
jne LBB3_3
movq example::X::__getit::__KEY@TLVP(%rip), %rdi
callq *(%rdi)
movq %rax, %rbx
leaq std::thread::local::fast::destroy_value(%rip), %rsi
movq %rax, %rdi
callq std::sys::unix::fast_thread_local::register_dtor
movb $1, 24(%rbx)
LBB3_3:
movq example::X::__getit::__KEY@TLVP(%rip), %rdi
callq *(%rdi)
cmpq $0, (%rax)
je LBB3_4
LBB3_7:
movq example::X::__getit::__KEY@TLVP(%rip), %rdi
callq *(%rdi)
leaq -40(%rbp), %rdi
movq %rax, %rsi
callq <alloc::string::String as core::clone::Clone>::clone
movq -40(%rbp), %rax
vmovups -32(%rbp), %xmm0
vmovaps %xmm0, -80(%rbp)
testq %rax, %rax
je LBB3_9
movq %rax, (%r14)
vmovaps -80(%rbp), %xmm0
vmovups %xmm0, 8(%r14)
movq %r14, %rax
addq $64, %rsp
popq %rbx
popq %r14
popq %rbp
retq
LBB3_4:
leaq -40(%rbp), %rdi
callq _init
vmovups -40(%rbp), %xmm0
vmovaps %xmm0, -64(%rbp)
movq -24(%rbp), %rcx
movq example::X::__getit::__KEY@TLVP(%rip), %rdi
callq *(%rdi)
movq (%rax), %rdi
movq 8(%rax), %rsi
vmovaps -64(%rbp), %xmm0
vmovups %xmm0, (%rax)
movq %rcx, 16(%rax)
testq %rdi, %rdi
je LBB3_7
testq %rsi, %rsi
je LBB3_7
movl $1, %edx
callq ___rust_dealloc
jmp LBB3_7
LBB3_9:
callq core::result::unwrap_failed
```
The asm demonstrates that even when the value has already been initialized and the destructor has been registered (but not yet run), the thread-local pointer is looked up (`callq *(%rdi)`) four times! On Linux that lookup occurs only once.
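Each of those lookups is an indirect call through the macOS TLV descriptor for `__KEY`: `movq ...@TLVP(%rip), %rdi` loads the descriptor and `callq *(%rdi)` calls its resolver (normally dyld's `_tlv_get_addr`). As a rough sketch only (the type and field names below are made up, and the real resolver has its own specialized calling convention), each lookup amounts to:

```rust
// Illustrative rendering of the macOS TLV descriptor behind `__KEY@TLVP`;
// names here are invented, not taken from dyld or libstd.
#[repr(C)]
pub struct TlvDescriptor {
    /// Resolver: called with the descriptor itself, returns the address of
    /// the current thread's copy of the variable.
    pub thunk: unsafe extern "C" fn(*const TlvDescriptor) -> *mut u8,
    pub key: usize,
    pub offset: usize,
}

/// What one `movq __KEY@TLVP(%rip), %rdi` + `callq *(%rdi)` pair does.
pub unsafe fn tlv_resolve(desc: *const TlvDescriptor) -> *mut u8 {
    ((*desc).thunk)(desc)
}
```

Every extra lookup in the listing above is therefore a full indirect call, not just a register reload.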
One potential fix is to insert a `read_volatile` inside of `__getit()`:
```rust
let key = &__KEY;
// make platform specific version of this
let key = $crate::ptr::read_volatile(&key);
key.get()
```
This improves the asm to the following:
```asm
__ZN17thread_local_test17thread_local_test3get17h69984f7e963f5a1bE:
pushq %rbp
movq %rsp, %rbp
pushq %r14
pushq %rbx
subq $48, %rsp
movq %rdi, %r14
movq __ZN17thread_local_test17thread_local_test3get1X7__getit5__KEY17h1886c2e600469f01E@TLVP(%rip), %rdi
callq *(%rdi)
movq %rax, -48(%rbp)
movq -48(%rbp), %rbx
cmpb $0, 25(%rbx)
jne LBB10_9
cmpb $0, 24(%rbx)
jne LBB10_3
leaq __ZN3std6thread5local4fast13destroy_value17hbc43def25f86e32eE(%rip), %rsi
movq %rbx, %rdi
callq __ZN3std3sys4unix17fast_thread_local13register_dtor17ha35ff2a0753ab802E
movb $1, 24(%rbx)
LBB10_3:
cmpq $0, (%rbx)
je LBB10_4
LBB10_7:
leaq -48(%rbp), %rdi
movq %rbx, %rsi
callq __ZN60_$LT$alloc..string..String$u20$as$u20$core..clone..Clone$GT$5clone17h9234dcb674122143E
movq -48(%rbp), %rax
vmovups -40(%rbp), %xmm0
vmovaps %xmm0, -64(%rbp)
testq %rax, %rax
je LBB10_9
movq %rax, (%r14)
vmovaps -64(%rbp), %xmm0
vmovups %xmm0, 8(%r14)
addq $48, %rsp
popq %rbx
popq %r14
popq %rbp
retq
LBB10_4:
leaq -48(%rbp), %rdi
callq __ZN17thread_local_test17thread_local_test4init17h2e4a5dfd2802b210E
vmovaps -48(%rbp), %xmm0
movq -32(%rbp), %rax
movq (%rbx), %rdi
movq 8(%rbx), %rsi
vmovups %xmm0, (%rbx)
movq %rax, 16(%rbx)
testq %rdi, %rdi
je LBB10_7
testq %rsi, %rsi
je LBB10_7
movl $1, %edx
callq ___rust_dealloc
jmp LBB10_7
LBB10_9:
callq __ZN4core6result13unwrap_failed17h6ad7be40c736aa06E
```
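For a self-contained illustration of the same idea outside of libstd (this is not the actual patch; the static and function below are invented for the example, and `#[thread_local]` needs a nightly compiler), the trick is to take the thread-local's address once and launder it through a volatile read so the optimizer treats it as opaque:

```rust
// Standalone sketch of the `read_volatile` barrier; not the libstd change.
#![feature(thread_local)]

use std::cell::Cell;
use std::ptr;

#[thread_local]
static COUNTER: Cell<u64> = Cell::new(0);

pub fn bump_three_times() -> u64 {
    // Take the thread-local's address once...
    let key = &COUNTER;
    // ...then launder it through a volatile read. The value is unchanged;
    // the read only stops the optimizer from re-deriving the TLS address
    // before each access below.
    let key: &Cell<u64> = unsafe { ptr::read_volatile(&key) };

    key.set(key.get() + 1);
    key.set(key.get() + 1);
    key.set(key.get() + 1);
    key.get()
}

fn main() {
    println!("{}", bump_three_times());
}
```

The volatile read returns the same reference and acts purely as an optimization barrier, which matches what the improved asm shows: a single `callq *(%rdi)` up front, with `%rbx` reused afterwards.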
Benchmark
```rust
// Requires a nightly toolchain; run with `cargo bench`.
#![feature(test)]
extern crate test;

use test::Bencher;

#[inline(never)]
#[cold]
fn init() -> String {
    String::from("hello world")
}

#[bench]
fn thread_local(b: &mut Bencher) {
    const ITER_COUNT: usize = 1_000_000;

    thread_local! {
        static X: String = init();
    }

    // Touch the thread local once up front so initialization and destructor
    // registration are not part of the measured loop.
    X.with(|_| {});

    b.iter(|| {
        for _ in 0..ITER_COUNT {
            X.with(|x| {
                test::black_box(x);
            })
        }
    })
}
```
Results
```
current       ... bench:   7,128,388 ns/iter
read_volatile ... bench:   2,085,139 ns/iter (+/- 204,564)
```
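With `ITER_COUNT = 1_000_000` thread-local accesses per `b.iter()` call, that works out to roughly 7.1 ns per access before the change versus roughly 2.1 ns after.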