Inverted movemasks result in redundant logic #89533

Closed
@Validark

Description

I wrote this tokenize function (https://zig.godbolt.org/z/oYosTb1zK):

export fn tokenize(source: [*]const u8) extern struct { start: [*]const u8, end: [*]const u8 } {
    var cur = source[0..];
    const start = cur;

    while (true) {
        const V = @Vector(@bitSizeOf(usize), u8);
        const vec: V = cur[0..@sizeOf(V)].*;

        // one bit per input byte, set where that byte is NOT '_'
        const identifier_bitstring = ~(@as(usize, @bitCast(vec == @as(V, @splat('_')))));

        cur = cur[@ctz(identifier_bitstring)..];
        if (identifier_bitstring != 0) break;
    }

    // our token span is start..end
    const end = cur;
    return .{ .start = start, .end = end };
}
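
For reference, here is a minimal usage sketch (the input bytes, test name, and zero padding are illustrative and not from the issue; the buffer is padded so the 64-byte vector load stays in bounds):

test "tokenize spans the leading run of '_' bytes" {
    // five '_' bytes, then 'a', padded with zeroes to a full 64-byte chunk
    const input = [_]u8{'_'} ** 5 ++ [_]u8{'a'} ++ [_]u8{0} ** 58;
    const tok = tokenize(&input);
    // start points at input[0]; end points at the first non-'_' byte
    try @import("std").testing.expectEqual(
        @as(usize, 5),
        @intFromPtr(tok.end) - @intFromPtr(tok.start),
    );
}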

Next I made the following change:

-       const identifier_bitstring = ~(@as(usize, @bitCast(vec == @as(V, @splat('_')))));
+       const identifier_bitstring =  (@as(usize, @bitCast(vec != @as(V, @splat('_')))));
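
With that change applied (and everything else identical to the first version), the loop reads:

    while (true) {
        const V = @Vector(@bitSizeOf(usize), u8);
        const vec: V = cur[0..@sizeOf(V)].*;

        // same bitstring, computed directly with != instead of inverting the == mask
        const identifier_bitstring = @as(usize, @bitCast(vec != @as(V, @splat('_'))));

        cur = cur[@ctz(identifier_bitstring)..];
        if (identifier_bitstring != 0) break;
    }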

Unfortunately, this results in different emit, with the first version doing redundant work.
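
Both formulations compute the same bitstring (inverting the mask of vec == '_' yields exactly the mask of vec != '_'), so identical emit would be expected. A quick sanity check of that equivalence (the test name and the width pinned to 64 lanes are illustrative):

test "inverted == bitmask equals != bitmask" {
    const V = @Vector(64, u8);
    var bytes = [_]u8{'_'} ** 64;
    bytes[3] = 'x';
    bytes[40] = 'y';
    const vec: V = bytes;

    const inverted = ~@as(u64, @bitCast(vec == @as(V, @splat('_'))));
    const direct = @as(u64, @bitCast(vec != @as(V, @splat('_'))));
    try @import("std").testing.expectEqual(direct, inverted);
}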

First version (Zen 4):

.LCPI0_1:
        .byte   95
tokenize1:
        vpbroadcastb    zmm0, byte ptr [rip + .LCPI0_1]
        mov     rax, rdi
        mov     rdx, rdi
.LBB0_1:
        vmovdqu64       zmm1, zmmword ptr [rdx]
        mov     rcx, rdx
        vpcmpneqb       k1, zmm1, zmm0
        vpcmpeqb        k0, zmm1, zmm0 ; do the same work, but this time not inverted, so we can use jb rather than je?
        kmovq   rdx, k1
        tzcnt   rdx, rdx
        add     rdx, rcx
        kortestq        k0, k0
        jb      .LBB0_1
        vzeroupper
        ret

Second version (Zen 4):

.LCPI1_1:
        .byte   95
tokenize2:
        vpbroadcastb    zmm0, byte ptr [rip + .LCPI1_1]
        mov     rax, rdi
        mov     rdx, rdi
.LBB1_1:
        vpcmpneqb       k0, zmm0, zmmword ptr [rdx]
        mov     rcx, rdx
        kmovq   rdx, k0
        tzcnt   rdx, rdx
        add     rdx, rcx
        kortestq        k0, k0
        je      .LBB1_1
        vzeroupper
        ret

First version (Zen 3):

.LCPI0_1:
        .byte   95
tokenize1:
        vpbroadcastb    ymm0, byte ptr [rip + .LCPI0_1]
        mov     rax, rdi
        mov     rdx, rdi
.LBB0_1:
        mov     rcx, rdx
        vpcmpeqb        ymm2, ymm0, ymmword ptr [rcx + 32]
        vpcmpeqb        ymm1, ymm0, ymmword ptr [rdx]
        vpmovmskb       esi, ymm2
        vpmovmskb       edx, ymm1
        shl     rsi, 32
        or      rsi, rdx
        mov     rdx, rsi ; preserve non-inverted rsi so we can cmp against -1 later??
        not     rdx
        tzcnt   rdx, rdx
        add     rdx, rcx
        cmp     rsi, -1
        je      .LBB0_1
        vzeroupper
        ret

Second version (Zen 3):

.LCPI1_1:
        .byte   95
tokenize2:
        vpbroadcastb    ymm0, byte ptr [rip + .LCPI1_1]
        mov     rax, rdi
        mov     rdx, rdi
.LBB1_1:
        mov     rcx, rdx
        vpcmpeqb        ymm2, ymm0, ymmword ptr [rcx + 32]
        vpcmpeqb        ymm1, ymm0, ymmword ptr [rdx]
        vpmovmskb       esi, ymm2
        vpmovmskb       edx, ymm1
        not     esi
        not     edx ; do 2 not's before combining these bitstrings instead of just doing 1??
        shl     rsi, 32
        or      rsi, rdx
        tzcnt   rdx, rsi
        add     rdx, rcx
        test    rsi, rsi ; use inverted value instead of preserving the non-inverted value and doing cmp -1??
        je      .LBB1_1
        vzeroupper
        ret

https://zig.godbolt.org/z/oYosTb1zK
