Closed
Description
I wrote this `tokenize` function (https://zig.godbolt.org/z/oYosTb1zK):
/// Skips a run of consecutive '_' bytes beginning at `source` and returns the
/// span `start..end`, where `start` is `source` and `end` points at the first
/// byte that is not '_'.
/// Every iteration loads @sizeOf(V) bytes at `cur` with no bounds check.
/// NOTE(review): presumably the caller guarantees readable padding and a
/// non-'_' byte somewhere ahead — confirm against the call sites.
export fn tokenize(source: [*]const u8) extern struct { start: [*]const u8, end: [*]const u8 } {
var cur = source[0..];
const start = cur;
while (true) {
// One u8 lane per bit of usize, so the comparison result fits a usize exactly.
const V = @Vector(@bitSizeOf(usize), u8);
// Load the next @sizeOf(V) bytes at `cur` as a vector.
const vec: V = cur[0..@sizeOf(V)].*;
// Inverted equality mask: a set bit marks a byte that is NOT '_'.
const identifier_bitstring = ~(@as(usize, @bitCast(vec == @as(V, @splat('_')))));
// Advance to the first non-'_' byte. When the mask is 0, @ctz returns the
// full bit width of usize, so the whole chunk is skipped.
cur = cur[@ctz(identifier_bitstring)..];
// A non-zero mask means this chunk contained a non-'_' byte: done.
if (identifier_bitstring != 0) break;
}
// our token span is start..end
const end = cur;
return .{ .start = start, .end = end };
}
Next I made the following change:
- const identifier_bitstring = ~(@as(usize, @bitCast(vec == @as(V, @splat('_')))));
+ const identifier_bitstring = (@as(usize, @bitCast(vec != @as(V, @splat('_')))));
Unfortunately, although the two expressions are logically equivalent, they produce different machine code.
First version (Zen 4):
; Constant pool: the byte to search for (95 = ASCII '_').
.LCPI0_1:
.byte 95
; tokenize1: rdi = source pointer on entry; on return rax = start, rdx = end.
tokenize1:
; zmm0 = '_' broadcast to all 64 byte lanes (loop-invariant).
vpbroadcastb zmm0, byte ptr [rip + .LCPI0_1]
; rax = start (never modified again), rdx = cur.
mov rax, rdi
mov rdx, rdi
.LBB0_1:
; Load the 64 bytes at cur; rcx keeps cur's value from before the advance.
vmovdqu64 zmm1, zmmword ptr [rdx]
mov rcx, rdx
; k1 = mask of bytes != '_'; k0 = mask of bytes == '_' (its complement).
vpcmpneqb k1, zmm1, zmm0
vpcmpeqb k0, zmm1, zmm0 ; do the same work, but this time not inverted, so we can use jb rather than je?
; rdx = index of the first non-'_' byte (tzcnt yields 64 when k1 is zero),
; then cur = old cur + that index.
kmovq rdx, k1
tzcnt rdx, rdx
add rdx, rcx
; kortestq sets CF when k0 is all ones, i.e. every byte in the chunk was '_';
; jb (CF=1) then continues with the next chunk.
kortestq k0, k0
jb .LBB0_1
; Clear the upper vector state before returning to the caller.
vzeroupper
ret
Second version (Zen 4):
; Constant pool: the byte to search for (95 = ASCII '_').
.LCPI1_1:
.byte 95
; tokenize2: rdi = source pointer on entry; on return rax = start, rdx = end.
tokenize2:
; zmm0 = '_' broadcast to all 64 byte lanes (loop-invariant).
vpbroadcastb zmm0, byte ptr [rip + .LCPI1_1]
; rax = start (never modified again), rdx = cur.
mov rax, rdi
mov rdx, rdi
.LBB1_1:
; k0 = mask of bytes != '_' in the 64 bytes at cur (compare straight from memory).
vpcmpneqb k0, zmm0, zmmword ptr [rdx]
; rcx keeps cur's value from before the advance.
mov rcx, rdx
; rdx = index of the first non-'_' byte (tzcnt yields 64 when k0 is zero),
; then cur = old cur + that index.
kmovq rdx, k0
tzcnt rdx, rdx
add rdx, rcx
; kortestq sets ZF when k0 is all zeros, i.e. no byte differed from '_';
; je (ZF=1) then continues with the next chunk.
kortestq k0, k0
je .LBB1_1
; Clear the upper vector state before returning to the caller.
vzeroupper
ret
First version (Zen 3):
; Constant pool: the byte to search for (95 = ASCII '_').
.LCPI0_1:
.byte 95
; tokenize1: rdi = source pointer on entry; on return rax = start, rdx = end.
tokenize1:
; ymm0 = '_' broadcast to all 32 byte lanes (loop-invariant).
vpbroadcastb ymm0, byte ptr [rip + .LCPI0_1]
; rax = start (never modified again), rdx = cur.
mov rax, rdi
mov rdx, rdi
.LBB0_1:
; rcx keeps cur's value from before the advance.
mov rcx, rdx
; Compare the 64 bytes at cur against '_' as two 32-byte halves:
; ymm2 covers bytes 32..63, ymm1 covers bytes 0..31.
vpcmpeqb ymm2, ymm0, ymmword ptr [rcx + 32]
vpcmpeqb ymm1, ymm0, ymmword ptr [rdx]
; Collapse each half to a 32-bit mask (bit set = byte == '_').
vpmovmskb esi, ymm2
vpmovmskb edx, ymm1
; rsi = combined 64-bit equality mask (high half shifted above the low half).
shl rsi, 32
or rsi, rdx
mov rdx, rsi ; preserve non-inverted rsi so we can cmp against -1 later??
; rdx = inverted mask (bit set = byte != '_'); tzcnt gives the index of the
; first such byte (64 when there is none), then cur = old cur + that index.
not rdx
tzcnt rdx, rdx
add rdx, rcx
; All 64 equality bits set means the whole chunk was '_': keep scanning.
cmp rsi, -1
je .LBB0_1
; Clear the upper vector state before returning to the caller.
vzeroupper
ret
Second version (Zen 3):
; Constant pool: the byte to search for (95 = ASCII '_').
.LCPI1_1:
.byte 95
; tokenize2: rdi = source pointer on entry; on return rax = start, rdx = end.
tokenize2:
; ymm0 = '_' broadcast to all 32 byte lanes (loop-invariant).
vpbroadcastb ymm0, byte ptr [rip + .LCPI1_1]
; rax = start (never modified again), rdx = cur.
mov rax, rdi
mov rdx, rdi
.LBB1_1:
; rcx keeps cur's value from before the advance.
mov rcx, rdx
; Compare the 64 bytes at cur against '_' as two 32-byte halves:
; ymm2 covers bytes 32..63, ymm1 covers bytes 0..31.
vpcmpeqb ymm2, ymm0, ymmword ptr [rcx + 32]
vpcmpeqb ymm1, ymm0, ymmword ptr [rdx]
; Collapse each half to a 32-bit mask (bit set = byte == '_').
vpmovmskb esi, ymm2
vpmovmskb edx, ymm1
; Invert each half separately (bit set = byte != '_'); the 32-bit writes
; leave the upper halves of rsi/rdx zero, so the combine below stays clean.
not esi
not edx ; do 2 not's before combining these bitstrings instead of just doing 1??
; rsi = combined 64-bit "not '_'" mask.
shl rsi, 32
or rsi, rdx
; rdx = index of the first non-'_' byte (tzcnt yields 64 when rsi is zero),
; then cur = old cur + that index.
tzcnt rdx, rsi
add rdx, rcx
test rsi, rsi ; use inverted value instead of preserving the non-inverted value and doing cmp -1??
; Zero mask means the whole chunk was '_': keep scanning.
je .LBB1_1
; Clear the upper vector state before returning to the caller.
vzeroupper
ret