Skip to content

missed optimization for ceiling division with known ranges #142497

@alex

Description

@alex

My starting point was the following Rust code (meaning is hopefully clear even to non-rust speakers):

pub fn src(v: u32) -> u32 {
    (u32::BITS - v.leading_zeros()).div_ceil(8)
}

rustc emits the following LLVM IR:

define noundef range(i32 0, 6) i32 @src(i32 noundef %v) unnamed_addr #0 {
start:
  %0 = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 %v, i1 false)
  %_2 = sub nuw nsw i32 32, %0
  %_41 = lshr i32 %_2, 3
  %_5 = and i32 %_2, 7
  %_6.not = icmp ne i32 %_5, 0
  %1 = zext i1 %_6.not to i32
  %_0.sroa.0.0 = add nuw nsw i32 %_41, %1
  ret i32 %_0.sroa.0.0
}

Which emits the following x86:

src:                                    # @src
        mov     ecx, 63
        bsr     ecx, edi
        xor     ecx, -32
        add     ecx, 33
        mov     eax, ecx
        shr     eax, 3
        and     ecx, 7
        cmp     ecx, 1
        sbb     eax, -1
        ret

however, this could be validly optimized to the following LLVM-IR:

define noundef range(i32 0, 5) i32 @tgt(i32 noundef %v) unnamed_addr #0 {
start:
  %0 = tail call i32 @llvm.ctlz.i32(i32 %v, i1 false)
  %1 = sub nuw nsw i32 32, %0
  %2 = add nuw nsw i32 %1, 7
  %3 = lshr i32 %2, 3
  ret i32 %3
}

which produces the following, much tighter x86:

tgt:                                    # @tgt
        mov     eax, 63
        bsr     eax, edi
        xor     eax, -32
        add     eax, 40
        shr     eax, 3
        ret

alive2 showing that the transformation is valid: https://alive2.llvm.org/ce/z/Ys4qAy

(As a bit of interest, I found the optimized versions using claude. Computers are wild: https://claude.ai/share/d998511d-45ee-4132-bee4-fe7f70350a67)

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions