Description
Rust should pass more structs in registers. Consider these examples: ideally, both functions would execute entirely in registers and wouldn't touch memory:
// Passing small structs by value.
pub fn parameters_by_value(v: (u64, u64)) -> u64 {
    v.0 + v.1
}

// Returning small structs by value.
pub fn return_by_value() -> (u64, u64) {
    (3, 4)
}
Rust, as of a recent 1.2.0-dev nightly, is unable to pass either of these in registers (see the LLVM IR and ASM below). It would be pretty safe to pass and return small structs (ones that fit into <=2 registers) in registers, and doing so is likely to improve performance on average. This is what the System V ABI does.
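For comparison, today's rustc already keeps everything in registers when the two values are declared as separate scalar parameters; here's a minimal sketch (the function name is mine, added for illustration):
// For comparison: the same computation with scalar parameters. Under the
// System V ABI, `a` and `b` arrive in %rdi and %rsi and the sum leaves in
// %rax, so the function never touches memory. The (u64, u64) versions
// above could in principle be lowered the same way.
pub fn parameters_as_scalars(a: u64, b: u64) -> u64 {
    a + b
}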
It would also be nice to exploit Rust's control over aliasing, and where possible also promote reference arguments to registers, i.e. put the u64 values in registers for the following functions:
// Passing small structs by reference.
pub fn parameters_by_ref(v: &(u64, u64)) -> u64 {
    v.0 + v.1
}

// Passing small structs by *mutable* reference.
pub fn mutable_parameters_by_ref(v: &mut (u64, u64)) {
    v.0 += 1;
    v.1 += 2;
}
In the &mut case, this would mean passing two u64 values in registers as function parameters, and returning two u64 values in registers as the return values (ideally we'd arrange for the parameter registers to match the return registers). The uniqueness of &mut makes this optimization valid, although we may have to give up on it in some cases, such as when raw pointers are present.
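For example, here is a contrived sketch (the names are mine) of the kind of raw-pointer escape that would force the pair to stay in memory:
// Contrived illustration of the raw-pointer caveat: the referent's address
// escapes into LEAKED, so code running after the call may read the pair
// through the raw pointer, and the incremented values must be visible in
// memory rather than parked in registers.
static mut LEAKED: *const (u64, u64) = std::ptr::null();

pub fn mutable_parameters_escape(v: &mut (u64, u64)) {
    unsafe { LEAKED = v as *const (u64, u64); }
    v.0 += 1;
    v.1 += 2;
}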
Here's a more realistic example where I've wanted Rust to do this:
pub fn skip_whitespace(iter: &mut std::str::Chars) -> u64 {
    // Reads as much whitespace as possible from the front of iter, then
    // returns the number of characters read.
    ...
}
This function is too large to justify inlining. I'd like the begin and end pointers of iter to be kept in registers across the function call.
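For concreteness, a minimal sketch of what skip_whitespace might look like (illustrative only; a realistic version is larger, which is exactly why it doesn't get inlined):
// Illustrative sketch only: clone the iterator to peek one char ahead,
// and commit the advance only when that char was whitespace.
pub fn skip_whitespace(iter: &mut std::str::Chars) -> u64 {
    let mut count = 0;
    loop {
        let mut lookahead = iter.clone();
        match lookahead.next() {
            Some(c) if c.is_whitespace() => {
                *iter = lookahead;
                count += 1;
            }
            _ => return count,
        }
    }
}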
Probably not surprising to the compiler team, but for completeness here is the LLVM IR of the above snippets, as of today's Rust (1.2.0-dev), compiled in release mode / opt-level=3:
define i64 @_ZN19parameters_by_value20h3d287104250c57c4eaaE({ i64, i64 }* noalias nocapture dereferenceable(16)) unnamed_addr #0 {
entry-block:
  %1 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i64 0, i32 0
  %2 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i64 0, i32 1
  %3 = load i64, i64* %1, align 8
  %4 = load i64, i64* %2, align 8
  %5 = add i64 %4, %3
  %6 = bitcast { i64, i64 }* %0 to i8*
  tail call void @llvm.lifetime.end(i64 16, i8* %6)
  ret i64 %5
}

define void @_ZN15return_by_value20h703d16a2e5f298d6saaE({ i64, i64 }* noalias nocapture sret dereferenceable(16)) unnamed_addr #0 {
entry-block:
  %1 = bitcast { i64, i64 }* %0 to i8*
  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ({ i64, i64 }* @const1285 to i8*), i64 16, i32 8, i1 false)
  ret void
}

define i64 @_ZN17parameters_by_ref20hc9c548b23d173a1aBaaE({ i64, i64 }* noalias nocapture readonly dereferenceable(16)) unnamed_addr #2 {
entry-block:
  %1 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i64 0, i32 0
  %2 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i64 0, i32 1
  %3 = load i64, i64* %1, align 8
  %4 = load i64, i64* %2, align 8
  %5 = add i64 %4, %3
  ret i64 %5
}

define void @_ZN25mutable_parameters_by_ref20h736bc2daba227c43QaaE({ i64, i64 }* noalias nocapture dereferenceable(16)) unnamed_addr #0 {
entry-block:
  %1 = bitcast { i64, i64 }* %0 to <2 x i64>*
  %2 = load <2 x i64>, <2 x i64>* %1, align 8
  %3 = add <2 x i64> %2, <i64 1, i64 2>
  %4 = bitcast { i64, i64 }* %0 to <2 x i64>*
  store <2 x i64> %3, <2 x i64>* %4, align 8
  ret void
}
And here is the ASM:
_ZN19parameters_by_value20h3d287104250c57c4eaaE:
    .cfi_startproc
    movq 8(%rdi), %rax
    addq (%rdi), %rax
    retq

_ZN15return_by_value20h703d16a2e5f298d6saaE:
    .cfi_startproc
    movups const1285(%rip), %xmm0
    movups %xmm0, (%rdi)
    movq %rdi, %rax
    retq

_ZN17parameters_by_ref20hc9c548b23d173a1aBaaE:
    .cfi_startproc
    movq 8(%rdi), %rax
    addq (%rdi), %rax
    retq

_ZN25mutable_parameters_by_ref20h736bc2daba227c43QaaE:
    .cfi_startproc
    movdqu (%rdi), %xmm0
    paddq .LCPI3_0(%rip), %xmm0
    movdqu %xmm0, (%rdi)
    retq
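For contrast, here is a hand-written sketch (not compiler output) of the lowering we'd hope for on the first two functions, assuming System V conventions: the pair arrives in %rdi/%rsi, and a pair is returned in %rax/%rdx:
parameters_by_value_ideal:
    leaq (%rdi,%rsi), %rax          # v.0 + v.1; no memory traffic at all
    retq
return_by_value_ideal:
    movl $3, %eax                   # first element returned in %rax
    movl $4, %edx                   # second element returned in %rdx
    retq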