Skip to content

Auto-vectorize saturating addition #131601

Open
@Validark

Description

@Validark

This code:

/// Returns the sum of the `len` `u64` values starting at `ptr`.
/// The caller must pass a `len` that is a non-zero multiple of 32: the first
/// statement marks every other length as unreachable.
export fn sum(ptr: [*]u64, len: usize) u64 {
    // Declare that `len` is never zero and never has any of its low five bits
    // set, i.e. it is always a positive multiple of 32.
    if ((len & 31) != 0 or len == 0) unreachable;
    // Running total.
    var a: u64 = 0;

    // Accumulate every element of the slice `ptr[0..len]` with plain `+=`.
    for (ptr[0..len]) |e|
        a += e;

    return a;
}

Gets auto-vectorized like so (x86-64 znver5):

# sum: vectorized u64 sum using plain (non-saturating) vector adds.
# Register use in this listing: rdi = base address of the data, rsi = element
# count, rax = element index inside the loop and the returned total.
sum:
        push    rbp
        mov     rbp, rsp
        # Zero the four accumulators zmm0-zmm3. They are written through their
        # xmm names; an AVX (`v`-prefixed) write to an xmm register also zeroes
        # the upper bits of the enclosing zmm register.
        vpxor   xmm0, xmm0, xmm0
        vpxor   xmm1, xmm1, xmm1
        vpxor   xmm2, xmm2, xmm2
        vpxor   xmm3, xmm3, xmm3
        # rax = 0: element index.
        xor     eax, eax
.LBB0_1:
        # Main loop: each accumulator adds one 64-byte vector (eight u64 lanes)
        # straight from memory, so one pass consumes 32 elements.
        vpaddq  zmm0, zmm0, zmmword ptr [rdi + 8*rax]
        vpaddq  zmm1, zmm1, zmmword ptr [rdi + 8*rax + 64]
        vpaddq  zmm2, zmm2, zmmword ptr [rdi + 8*rax + 128]
        vpaddq  zmm3, zmm3, zmmword ptr [rdi + 8*rax + 192]
        add     rax, 32
        # Repeat until the index equals the element count in rsi.
        cmp     rsi, rax
        jne     .LBB0_1
        # Merge the four accumulators into zmm0.
        vpaddq  zmm0, zmm1, zmm0
        vpaddq  zmm2, zmm3, zmm2
        vpaddq  zmm0, zmm2, zmm0
        # Horizontal reduction: repeatedly add the upper half onto the lower
        # half (512 -> 256 -> 128 -> 64 bits).
        vextracti64x4   ymm1, zmm0, 1
        # Only the low 256 bits of this sum are read afterwards.
        vpaddq  zmm0, zmm0, zmm1
        vextracti128    xmm1, ymm0, 1
        vpaddq  xmm0, xmm0, xmm1
        # Shuffle immediate 238 copies the upper 64-bit lane into the lower one.
        vpshufd xmm1, xmm0, 238
        vpaddq  xmm0, xmm0, xmm1
        # Return the low 64-bit lane.
        vmovq   rax, xmm0
        pop     rbp
        vzeroupper
        ret

However, change the += to +|= (saturating addition) and you get no vectorization (Zig Godbolt) (LLVM Godbolt):

# sum (saturating variant): scalar code, no vector instructions.
# Register use in this listing: rdi = base address of the data, rsi = element
# count, rcx = element index, rax = running total / return value.
# The loop body is unrolled eight times (offsets 0..56, index step 8). The
# running value alternates between r8 and rdx. Every `add` sets the carry flag
# on unsigned overflow: `jb` is taken on carry, `jae` when there is no carry.
# Two parallel chains exist:
#   .LBB0_5/7/9/11/13/15  - the running value is already clamped to -1
#                           (all ones), so the add starts from a fresh -1.
#   .LBB0_2/4/6/8/10/12/14 - the running value is still exact and is copied
#                           into the other register before the next add.
sum:
        push    rbp
        mov     rbp, rsp
        # rcx = 0 (index), rax = 0 (total).
        xor     ecx, ecx
        xor     eax, eax
        jmp     .LBB0_1
.LBB0_17:
        # Loop latch: advance by the eight elements just processed and exit
        # once the index equals the element count.
        add     rcx, 8
        cmp     rsi, rcx
        je      .LBB0_18
.LBB0_1:
        # Element 0: r8 = total + ptr[rcx].
        mov     r8, rax
        add     r8, qword ptr [rdi + 8*rcx]
        # Preload -1 (the saturated value) into rax and rdx; `mov` leaves the
        # flags from the add untouched.
        mov     rax, -1
        mov     rdx, -1
        # No carry: continue on the exact chain.
        jae     .LBB0_2
        # Carry: element 1 is added to the clamped value -1 held in rdx.
        add     rdx, qword ptr [rdi + 8*rcx + 8]
        mov     r8, -1
        # No carry here means rdx is still -1; rejoin the copy chain with it.
        jae     .LBB0_4
.LBB0_5:
        # Clamped chain, element 2.
        add     r8, qword ptr [rdi + 8*rcx + 16]
        mov     rdx, -1
        jae     .LBB0_6
.LBB0_7:
        # Clamped chain, element 3.
        add     rdx, qword ptr [rdi + 8*rcx + 24]
        mov     r8, -1
        jae     .LBB0_8
.LBB0_9:
        # Clamped chain, element 4.
        add     r8, qword ptr [rdi + 8*rcx + 32]
        mov     rdx, -1
        jae     .LBB0_10
.LBB0_11:
        # Clamped chain, element 5.
        add     rdx, qword ptr [rdi + 8*rcx + 40]
        mov     r8, -1
        jae     .LBB0_12
.LBB0_13:
        # Clamped chain, element 6.
        add     r8, qword ptr [rdi + 8*rcx + 48]
        mov     rdx, -1
        jae     .LBB0_14
.LBB0_15:
        # Clamped chain, element 7. On carry rax keeps the -1 loaded at
        # .LBB0_1; otherwise rdx is copied into rax at .LBB0_16.
        add     rdx, qword ptr [rdi + 8*rcx + 56]
        jb      .LBB0_17
        jmp     .LBB0_16
.LBB0_2:
        # Exact chain, element 1: carry the value over from r8 and add.
        mov     rdx, r8
        add     rdx, qword ptr [rdi + 8*rcx + 8]
        # r8 = -1 is the clamped value used if this add overflowed.
        mov     r8, -1
        jb      .LBB0_5
.LBB0_4:
        # Exact chain, element 2.
        mov     r8, rdx
        add     r8, qword ptr [rdi + 8*rcx + 16]
        mov     rdx, -1
        jb      .LBB0_7
.LBB0_6:
        # Exact chain, element 3.
        mov     rdx, r8
        add     rdx, qword ptr [rdi + 8*rcx + 24]
        mov     r8, -1
        jb      .LBB0_9
.LBB0_8:
        # Exact chain, element 4.
        mov     r8, rdx
        add     r8, qword ptr [rdi + 8*rcx + 32]
        mov     rdx, -1
        jb      .LBB0_11
.LBB0_10:
        # Exact chain, element 5.
        mov     rdx, r8
        add     rdx, qword ptr [rdi + 8*rcx + 40]
        mov     r8, -1
        jb      .LBB0_13
.LBB0_12:
        # Exact chain, element 6.
        mov     r8, rdx
        add     r8, qword ptr [rdi + 8*rcx + 48]
        mov     rdx, -1
        jb      .LBB0_15
.LBB0_14:
        # Exact chain, element 7. On carry rax keeps the -1 loaded at .LBB0_1.
        mov     rdx, r8
        add     rdx, qword ptr [rdi + 8*rcx + 56]
        jb      .LBB0_17
.LBB0_16:
        # No overflow on the last add: the value in rdx becomes the new total.
        mov     rax, rdx
        jmp     .LBB0_17
.LBB0_18:
        # Done: the total is in rax.
        pop     rbp
        ret

With more manual vectorization, I can get the compiler to emit this:

# sum2: vectorized saturating u64 sum.
# Register use in this listing: rdi = base address of the data, rsi = element
# count, rax = element index inside the loop and the returned total.
# Saturating-add idiom used throughout, per 64-bit lane:
#     t   = ~x              (register copy + vpternlogq with immediate 15;
#                            with all three operands holding the same value
#                            that immediate yields the bitwise NOT)
#     acc = min_u(acc, t)   (vpminuq: acc can now be at most MAX - x)
#     acc = acc + x         (vpaddq: cannot wrap; gives all ones if clamped)
# Accumulators and the load they absorb each pass:
#     zmm1 <- [+0]   zmm3 <- [+64]   zmm2 <- [+128]   zmm0 <- [+192]
sum2:
        push    rbp
        mov     rbp, rsp
        # Zero the four accumulators. They are written through their xmm
        # names; an AVX (`v`-prefixed) write to an xmm register also zeroes
        # the upper bits of the enclosing zmm register.
        vpxor   xmm1, xmm1, xmm1
        vpxor   xmm3, xmm3, xmm3
        vpxor   xmm2, xmm2, xmm2
        vpxor   xmm0, xmm0, xmm0
        # rax = 0: element index.
        xor     eax, eax
.LBB1_1:
        # Load four 64-byte vectors (32 elements) with unaligned loads.
        vmovdqu64       zmm6, zmmword ptr [rdi + 8*rax + 128]
        vmovdqu64       zmm5, zmmword ptr [rdi + 8*rax + 64]
        vmovdqu64       zmm4, zmmword ptr [rdi + 8*rax]
        vmovdqu64       zmm7, zmmword ptr [rdi + 8*rax + 192]
        add     rax, 32
        # zmm2 = min(zmm2, ~zmm6)
        vmovdqa64       zmm8, zmm6
        vpternlogq      zmm8, zmm6, zmm6, 15
        vpminuq zmm2, zmm2, zmm8
        # zmm3 = min(zmm3, ~zmm5)
        vmovdqa64       zmm8, zmm5
        vpternlogq      zmm8, zmm5, zmm5, 15
        vpminuq zmm3, zmm3, zmm8
        # zmm2 += zmm6; zmm6 is then reused as scratch for ~zmm4.
        vpaddq  zmm2, zmm2, zmm6
        vmovdqa64       zmm6, zmm4
        vpternlogq      zmm6, zmm4, zmm4, 15
        # zmm3 += zmm5; zmm5 is then reused as scratch for ~zmm7.
        vpaddq  zmm3, zmm3, zmm5
        vmovdqa64       zmm5, zmm7
        vpternlogq      zmm5, zmm7, zmm7, 15
        # Clamp, then add, for the remaining two accumulators.
        vpminuq zmm1, zmm1, zmm6
        vpminuq zmm0, zmm0, zmm5
        vpaddq  zmm1, zmm1, zmm4
        vpaddq  zmm0, zmm0, zmm7
        # Repeat until the index equals the element count in rsi.
        cmp     rsi, rax
        jne     .LBB1_1
        # Merge the accumulators with the same saturating idiom:
        # zmm1 = sat(zmm1 + zmm3), then sat(.. + zmm2), then sat(.. + zmm0).
        vmovdqa64       zmm4, zmm3
        vpternlogq      zmm4, zmm3, zmm3, 15
        vpminuq zmm1, zmm1, zmm4
        vmovdqa64       zmm4, zmm2
        vpternlogq      zmm4, zmm2, zmm2, 15
        vpaddq  zmm1, zmm1, zmm3
        vpminuq zmm1, zmm1, zmm4
        vpaddq  zmm1, zmm1, zmm2
        vmovdqa64       zmm2, zmm0
        vpternlogq      zmm2, zmm0, zmm0, 15
        vpminuq zmm1, zmm1, zmm2
        # The merged result lands in zmm0.
        vpaddq  zmm0, zmm1, zmm0
        # Horizontal reduction, 512 -> 256 bits: saturating-add the upper half
        # onto the lower half.
        vextracti64x4   ymm1, zmm0, 1
        vmovdqa ymm2, ymm1
        vpternlogq      ymm2, ymm1, ymm1, 15
        vpminuq ymm0, ymm0, ymm2
        vpaddq  ymm0, ymm0, ymm1
        # 256 -> 128 bits.
        vextracti128    xmm1, ymm0, 1
        vmovdqa xmm2, xmm1
        vpternlogq      xmm2, xmm1, xmm1, 15
        vpminuq xmm0, xmm0, xmm2
        vpaddq  xmm0, xmm0, xmm1
        # 128 -> 64 bits: shuffle immediate 238 copies the upper 64-bit lane
        # into the lower one.
        vpshufd xmm1, xmm0, 238
        vmovdqa xmm2, xmm1
        vpternlogq      xmm2, xmm1, xmm1, 15
        vpminuq xmm0, xmm0, xmm2
        vpaddq  xmm0, xmm0, xmm1
        # Return the low 64-bit lane.
        vmovq   rax, xmm0
        pop     rbp
        vzeroupper
        ret

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions