Open
Description
My starting point was the following Rust code (meaning is hopefully clear even to non-rust speakers):
pub fn src(v: u32) -> u32 {
(u32::BITS - v.leading_zeros()).div_ceil(8)
}
rustc emits the following LLVM IR:
define noundef range(i32 0, 6) i32 @src(i32 noundef %v) unnamed_addr #0 {
start:
%0 = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 %v, i1 false)
%_2 = sub nuw nsw i32 32, %0
%_41 = lshr i32 %_2, 3
%_5 = and i32 %_2, 7
%_6.not = icmp ne i32 %_5, 0
%1 = zext i1 %_6.not to i32
%_0.sroa.0.0 = add nuw nsw i32 %_41, %1
ret i32 %_0.sroa.0.0
}
Which emits the following x86:
src: # @src
mov ecx, 63
bsr ecx, edi
xor ecx, -32
add ecx, 33
mov eax, ecx
shr eax, 3
and ecx, 7
cmp ecx, 1
sbb eax, -1
ret
however, this could be validly optimized to the following LLVM-IR:
define noundef range(i32 0, 5) i32 @tgt(i32 noundef %v) unnamed_addr #0 {
start:
%0 = tail call i32 @llvm.ctlz.i32(i32 %v, i1 false)
%1 = sub nuw nsw i32 32, %0
%2 = add nuw nsw i32 %1, 7
%3 = lshr i32 %2, 3
ret i32 %3
}
which produces the following, much tighter x86:
tgt: # @tgt
mov eax, 63
bsr eax, edi
xor eax, -32
add eax, 40
shr eax, 3
ret
alive2 showing that the transformation is valid: https://alive2.llvm.org/ce/z/Ys4qAy
(As a bit of interest, I found the optimized versions using claude. Computers are wild: https://claude.ai/share/d998511d-45ee-4132-bee4-fe7f70350a67)