Description
In my real code, I have a bunch of slices (each is a `struct { ptr, len }`) and I want to sum together the `len` values to determine the total size (Zig Godbolt):
```zig
export fn numBytesInFiles(files: [*][]const u8, num_files: usize) usize {
    if ((num_files & 31) != 0 or num_files == 0) unreachable; // make emit simpler
    var num_bytes: usize = 0;
    for (files[0..num_files]) |file_data|
        num_bytes += file_data.len;
    return num_bytes;
}
```
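As a quick aside for reading the assembly below: a `[]const u8` is lowered as a two-word `{ ptr, len }` pair with `len` in the second word (at least on current 64-bit targets), which is where the stride-16, displacement-8 addressing in the dumps comes from. A minimal check of that assumption:

```zig
const std = @import("std");

test "slice layout assumed by the loads below" {
    // On 64-bit targets a []const u8 is { ptr, len }: 16 bytes total,
    // len in the second word, so files[i].len sits at byte offset i * 16 + 8.
    const s: []const u8 = "hello";
    try std.testing.expectEqual(@as(usize, 16), @sizeOf([]const u8));
    const words: *const [2]usize = @ptrCast(&s);
    try std.testing.expectEqual(s.len, words[1]);
}
```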
By default, this is what I get:
```asm
.LCPI0_1:
.quad 32
.LCPI0_2:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 4
.byte 5
.byte 6
.byte 7
numBytesInFiles:
push rbp
mov rbp, rsp
vpmovsxbq zmm1, qword ptr [rip + .LCPI0_2]
vpbroadcastq zmm2, qword ptr [rip + .LCPI0_1]
vpxor xmm0, xmm0, xmm0
vpxor xmm3, xmm3, xmm3
vpxor xmm4, xmm4, xmm4
vpxor xmm5, xmm5, xmm5
.LBB0_1:
vpsllq zmm6, zmm1, 4
kxnorw k1, k0, k0
vpxor xmm7, xmm7, xmm7
vpaddq zmm1, zmm1, zmm2
add rsi, -32
vpgatherqq zmm7 {k1}, zmmword ptr [rdi + zmm6 + 8]
kxnorw k1, k0, k0
vpaddq zmm0, zmm7, zmm0
vpxor xmm7, xmm7, xmm7
vpgatherqq zmm7 {k1}, zmmword ptr [rdi + zmm6 + 136]
kxnorw k1, k0, k0
vpaddq zmm3, zmm7, zmm3
vpxor xmm7, xmm7, xmm7
vpgatherqq zmm7 {k1}, zmmword ptr [rdi + zmm6 + 264]
kxnorw k1, k0, k0
vpaddq zmm4, zmm7, zmm4
vpxor xmm7, xmm7, xmm7
vpgatherqq zmm7 {k1}, zmmword ptr [rdi + zmm6 + 392]
vpaddq zmm5, zmm7, zmm5
jne .LBB0_1
vpaddq zmm0, zmm3, zmm0
vpaddq zmm0, zmm4, zmm0
vpaddq zmm0, zmm5, zmm0
vextracti64x4 ymm1, zmm0, 1
vpaddq zmm0, zmm0, zmm1 ; why is this not a ymm operation?
vextracti128 xmm1, ymm0, 1
vpaddq xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 238
vpaddq xmm0, xmm0, xmm1
vmovq rax, xmm0
pop rbp
vzeroupper
ret
```
There are a number of issues with this emit: there are serial dependency chains that could have been parallel, and it uses `vpgatherqq` at a high cost for little benefit. On LLVM's Godbolt (I assume a newer version of LLVM) I got this emit (LLVM Godbolt):
```asm
.LCPI0_1:
.byte 0
.byte 2
.byte 4
.byte 6
.byte 8
.byte 10
.byte 12
.byte 14
numBytesInFiles:
.LnumBytesInFiles$local:
push rbp
mov rbp, rsp
cmp rsi, 33
jae .LBB0_2
xor eax, eax
xor edx, edx
jmp .LBB0_5
.LBB0_2:
vpmovsxbq zmm1, qword ptr [rip + .LCPI0_1]
lea rax, [rsi - 32]
lea rcx, [rdi + 392]
vpxor xmm0, xmm0, xmm0
vpxor xmm2, xmm2, xmm2
vpxor xmm3, xmm3, xmm3
vpxor xmm4, xmm4, xmm4
mov rdx, rax
.LBB0_3:
vmovdqu64 zmm5, zmmword ptr [rcx - 384]
vmovdqu64 zmm6, zmmword ptr [rcx - 256]
vmovdqu64 zmm7, zmmword ptr [rcx - 128]
vmovdqu64 zmm8, zmmword ptr [rcx]
vpermt2q zmm5, zmm1, zmmword ptr [rcx - 320]
vpermt2q zmm6, zmm1, zmmword ptr [rcx - 192]
vpermt2q zmm7, zmm1, zmmword ptr [rcx - 64]
vpermt2q zmm8, zmm1, zmmword ptr [rcx + 64]
add rcx, 512
add rdx, -32
vpaddq zmm0, zmm5, zmm0
vpaddq zmm2, zmm6, zmm2
vpaddq zmm3, zmm7, zmm3
vpaddq zmm4, zmm8, zmm4
jne .LBB0_3
vpaddq zmm0, zmm2, zmm0
vpaddq zmm0, zmm3, zmm0
vpaddq zmm0, zmm4, zmm0
vextracti64x4 ymm1, zmm0, 1
vpaddq zmm0, zmm0, zmm1; why is this not a ymm operation?
vextracti128 xmm1, ymm0, 1
vpaddq xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 238
vpaddq xmm0, xmm0, xmm1
vmovq rdx, xmm0; Why do we need all the stuff below???
.LBB0_5:
lea rcx, [rsi - 4]
vmovq xmm0, rdx
mov rdx, rcx
sub rdx, rax
shl rax, 4
lea rax, [rax + rdi + 8]
.LBB0_6:
vmovdqu ymm1, ymmword ptr [rax]
vpunpcklqdq ymm1, ymm1, ymmword ptr [rax + 32]
add rax, 64
add rdx, -4
vpermq ymm1, ymm1, 216
vpaddq ymm0, ymm1, ymm0
jne .LBB0_6
vextracti128 xmm1, ymm0, 1
shl rcx, 4
shl rsi, 4
vpaddq xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 238
vpaddq xmm0, xmm0, xmm1
vmovq rax, xmm0
add rax, qword ptr [rdi + rcx + 8]
add rax, qword ptr [rsi + rdi - 40]
add rax, qword ptr [rsi + rdi - 24]
add rax, qword ptr [rsi + rdi - 8]
pop rbp
vzeroupper
ret
```
I can see what the compiler was going for until `.LBB0_5`. After that, I am at a loss as to what it is doing, and I did not spend enough time walking through each step to figure it out. But I would prefer an emit similar to the following, which I was able to coerce the compiler into producing for a size-optimized build:
```asm
numBytesInFilesFaster:
push rbp
mov rbp, rsp
shr rsi, 4
vpxor xmm0, xmm0, xmm0
vpxor xmm1, xmm1, xmm1
vpxor xmm2, xmm2, xmm2
vpxor xmm3, xmm3, xmm3
.LBB0_1:
sub rsi, 1
jb .LBB0_3
vpaddq zmm0, zmm0, zmmword ptr [rdi]
vpaddq zmm1, zmm1, zmmword ptr [rdi + 64]
vpaddq zmm2, zmm2, zmmword ptr [rdi + 128]
vpaddq zmm3, zmm3, zmmword ptr [rdi + 192]
add rdi, 256
jmp .LBB0_1
.LBB0_3:
vpaddq zmm0, zmm0, zmm1
vpaddq zmm1, zmm2, zmm3
vpaddq zmm0, zmm0, zmm1
vextracti64x4 ymm1, zmm0, 1
vpaddq ymm0, ymm0, ymm1; Yay! Not a zmm operation!
vextracti128 xmm1, ymm0, 1
vpaddq xmm0, xmm0, xmm1
vpextrq rax, xmm0, 1; just forget about xmm0[0]
pop rbp
vzeroupper
ret
```
I feel like this strategy makes a lot more sense. I didn't speed-test against the new LLVM version, but if I remember correctly, my idea was 2-3x faster than the `vpgatherqq` strategy. The `vpermt2q` strategy is definitely a lot smarter than the `vpgatherqq` one, so I am not sure how the two compare. It would be easiest for me to compare once Zig is on a newer LLVM, and I am not sure what the timeframe for that looks like. Since I am a bit pressed for time at the moment, I didn't want to commit to setting up a benchmark right now, but I wanted to get this idea out there in case someone wants to dig into it sooner rather than later.
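In case someone does pick this up before Zig moves to a newer LLVM: below is a rough, untested sketch (the function name and the relaxed multiple-of-16 precondition are mine) of how the `numBytesInFilesFaster` strategy can be written out explicitly with `@Vector`: sum whole `{ ptr, len }` pairs into four independent accumulators, and only discard the pointer lanes at the very end.

```zig
const V = @Vector(8, usize);

// Rough sketch, untested: mirrors the numBytesInFilesFaster emit above.
// Assumes the usual { ptr, len } slice layout and that num_files is a
// nonzero multiple of 16 (the original function demanded 32).
export fn numBytesInFilesSketch(files: [*][]const u8, num_files: usize) usize {
    // Slices are only 8-byte aligned, so keep align(8) on the vector pointer;
    // AVX-512 loads tolerate unaligned memory operands.
    const words: [*]align(8) const V = @ptrCast(files);
    // Four independent accumulators break the serial dependency chain.
    var acc = [4]V{ @splat(0), @splat(0), @splat(0), @splat(0) };
    var i: usize = 0;
    while (i < num_files / 16) : (i += 1) {
        // Each V covers 4 slices (8 words), so one iteration eats 16 slices.
        // Wrapping adds: the ptr lanes will overflow, but they get discarded.
        for (&acc, 0..) |*a, j| a.* +%= words[i * 4 + j];
    }
    const total = (acc[0] +% acc[1]) +% (acc[2] +% acc[3]);
    // Odd lanes hold len sums; even lanes hold meaningless ptr sums.
    const lens = @shuffle(usize, total, undefined, @Vector(4, i32){ 1, 3, 5, 7 });
    return @reduce(.Add, lens);
}
```

I have not checked what current LLVM turns this into, but by construction it sidesteps both the gathers and the permutes.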