The generated x86 code can be improved in some fairly simple ways - hoisting computed constants out of loop bodies, and avoiding unnecessary register moves - that have a significant performance impact on tight loops. In the following example those improvements produce a 35% speedup.
Here is an alternate, DFA-based implementation of utf8.Valid that I have been playing with:
func Valid(x []byte) bool {
	state := uint64(1 * 6)
	for _, b := range x {
		state = dfa[b] >> (state & 63)
	}
	return (state & 63) == 1*6
}
const (
	s00 = 0 | (1*6)<<(1*6)
	sC0 = 0
	sC2 = 0 | (2*6)<<(1*6)
	sE0 = 0 | (3*6)<<(1*6)
	sE1 = 0 | (4*6)<<(1*6)
	sED = 0 | (5*6)<<(1*6)
	sEE = sE1
	sF0 = 0 | (6*6)<<(1*6)
	sF1 = 0 | (7*6)<<(1*6)
	sF4 = 0 | (8*6)<<(1*6)
	sF5 = 0

	s80 = 0 | (1*6)<<(2*6) | (2*6)<<(4*6) | (4*6)<<(5*6) | (4*6)<<(7*6) | (4*6)<<(8*6)
	s90 = 0 | (1*6)<<(2*6) | (2*6)<<(4*6) | (4*6)<<(5*6) | (4*6)<<(6*6) | (4*6)<<(7*6)
	sA0 = 0 | (1*6)<<(2*6) | (2*6)<<(3*6) | (2*6)<<(4*6) | (4*6)<<(6*6) | (4*6)<<(7*6)
)
var dfa = [256]uint64{
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80,
	s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90,
	sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0,
	sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0,
	sC0, sC0, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2,
	sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2,
	sE0, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sED, sEE, sEE,
	sF0, sF1, sF1, sF1, sF4, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5,
}
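An aside on the encoding, since it is easy to miss: each DFA state is a bit offset that is a multiple of 6, offset 0 is the reject state, and offset 1*6 is the start/accept state. For each input byte b, dfa[b] packs the next-state offset for every current state into 6-bit fields, so a transition is just a shift and a mask. A minimal sketch of the decoding (the helper is mine, purely for illustration):

// nextState decodes one packed transition: it extracts the 6-bit field of
// dfa[b] at the current state's bit offset, which is the bit offset of the
// next state (0 = reject, 1*6 = start/accept).
func nextState(state uint64, b byte) uint64 {
	return (dfa[b] >> (state & 63)) & 63
}

Valid itself skips the trailing &63 because the next iteration masks the shift amount anyway; only the final comparison needs the mask.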
There are no big benchmarks of Valid in the package, but here are some that could be added:
var valid1k = bytes.Repeat([]byte("0123456789日本語日本語日本語日abcdefghijklmnopqrstuvwx"), 16)
var valid1M = bytes.Repeat(valid1k, 1024)

func BenchmarkValid1K(b *testing.B) {
	b.SetBytes(int64(len(valid1k)))
	for i := 0; i < b.N; i++ {
		Valid(valid1k)
	}
}

func BenchmarkValid1M(b *testing.B) {
	b.SetBytes(int64(len(valid1M)))
	for i := 0; i < b.N; i++ {
		Valid(valid1M)
	}
}
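These run in the usual way from the package directory, for example:
% go test -run=NONE -bench='Valid1[KM]'
(the -run=NONE just skips the tests so only the benchmarks run).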
The old Valid implementation runs at around 1450 MB/s.
The implementation above runs at around 1600 MB/s.
Better but not what I had hoped.
It compiles as follows:
% go test -c && go tool objdump -s 'utf8.Valid$' utf8.test
TEXT unicode/utf8.Valid(SB) /Users/rsc/go/src/unicode/utf8/utf8.go
utf8.go:647 0x10789c0 4889442408 MOVQ AX, 0x8(SP)
utf8.go:649 0x10789c5 31c9 XORL CX, CX
utf8.go:649 0x10789c7 ba06000000 MOVL $0x6, DX
utf8.go:649 0x10789cc eb1f JMP 0x10789ed
utf8.go:649 0x10789ce 0fb63408 MOVZX 0(AX)(CX*1), SI
utf8.go:649 0x10789d2 488d7901 LEAQ 0x1(CX), DI
utf8.go:650 0x10789d6 4c8d0543ef1600 LEAQ unicode/utf8.dfa(SB), R8
utf8.go:650 0x10789dd 498b34f0 MOVQ 0(R8)(SI*8), SI
utf8.go:650 0x10789e1 4889d1 MOVQ DX, CX
utf8.go:650 0x10789e4 48d3ee SHRQ CL, SI
utf8.go:649 0x10789e7 4889f9 MOVQ DI, CX
utf8.go:652 0x10789ea 4889f2 MOVQ SI, DX
utf8.go:649 0x10789ed 4839cb CMPQ CX, BX
utf8.go:649 0x10789f0 7fdc JG 0x10789ce
utf8.go:652 0x10789f2 4883e23f ANDQ $0x3f, DX
utf8.go:652 0x10789f6 4883fa06 CMPQ $0x6, DX
utf8.go:652 0x10789fa 0f94c0 SETE AL
utf8.go:652 0x10789fd c3 RET
:-1 0x10789fe cc INT $0x3
:-1 0x10789ff cc INT $0x3
%
Translating this to proper non-regabi assembly I get:
#include "textflag.h"

TEXT ·Valid(SB),NOSPLIT,$0
	MOVQ x_base+0(FP), AX   // p = &x[0]
	MOVQ x_len+8(FP), BX    // n = len(x)
	XORL CX, CX             // i = 0
	MOVL $6, DX
	JMP  loop
body:
	MOVBLZX (AX)(CX*1), SI  // b = p[i]
	LEAQ 1(CX), DI          // j = i+1
	LEAQ ·dfa(SB), R8
	MOVQ (R8)(SI*8), SI     // t = dfa[b]
	MOVQ DX, CX             // CX = state
	SHRQ CX, SI             // t >>= state&63
	MOVQ DI, CX             // i = j
	MOVQ SI, DX             // state = t
loop:
	CMPQ CX, BX
	JLT  body
	ANDL $0x3f, DX
	CMPL DX, $6
	SETEQ AL
	MOVB AX, ret+24(FP)
	RET
This also runs at about 1600 MB/s.
First optimization: the LEAQ ·dfa(SB), R8 should be hoisted out of the loop body. (I tried to do this in the Go version with dfa := &dfa, but it got constant-propagated away!)
That change brings it up to 1750 MB/s.
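For reference, the Go-level attempt mentioned above looked roughly like this (a sketch of what I mean, not a proposed change; the compiler treats &dfa as a constant, so the address computation is not actually hoisted):

func Valid(x []byte) bool {
	state := uint64(1 * 6)
	dfa := &dfa // meant to hoist the table address out of the loop
	for _, b := range x {
		state = dfa[b] >> (state & 63) // but the compiler propagates &dfa back in here
	}
	return (state & 63) == 1*6
}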
Second optimization: use DI for i instead of CX, to avoid the pressure on CX. This lets the LEAQ 1(CX), DI and the later MOVQ DI, CX collapse to just LEAQ 1(DI), DI.
That change brings it up to 1900 MB/s.
The body is now:
body:
	MOVBLZX (AX)(DI*1), SI  // b = p[i]
	LEAQ 1(DI), DI          // i++
	MOVQ (R8)(SI*8), SI     // t = dfa[b]
	MOVQ DX, CX             // CX = state
	SHRQ CX, SI             // t >>= state&63
	MOVQ SI, DX             // state = t
Third optimization: since DX is moving into CX, do that one instruction earlier, allowing the use of SI to be optimized into DX to eliminate the final MOVQ:
body:
	MOVBLZX (AX)(DI*1), SI  // b = p[i]
	LEAQ 1(DI), DI          // i++
	MOVQ DX, CX             // CX = state
	MOVQ (R8)(SI*8), DX     // state = dfa[b]
	SHRQ CX, DX             // state >>= CX&63
I think this ends up being just "compute the shift amount before the shifted value".
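In source terms, that reordering is roughly the following (a sketch; the name validReordered is just for illustration, and both forms are equivalent Go, the difference is only which instruction schedule the compiler picks):

func validReordered(x []byte) bool {
	state := uint64(1 * 6)
	for _, b := range x {
		shift := state & 63     // shift amount computed from the old state first...
		state = dfa[b] >> shift // ...then the value being shifted
	}
	return (state & 63) == 1*6
}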
That change brings it up to 2150 MB/s.
This is still a direct translation of the Go code: there are no tricks the compiler couldn't do.
For this particular loop, the optimizations make the code run 35% faster.
Final assembly:
#include "textflag.h"

TEXT ·Valid(SB),NOSPLIT,$0
	MOVQ x_base+0(FP), AX   // p = &x[0]
	MOVQ x_len+8(FP), BX    // n = len(x)
	XORL DI, DI             // i = 0
	MOVL $6, DX
	LEAQ ·dfa(SB), R8
	JMP  loop
body:
	MOVBLZX (AX)(DI*1), SI  // b = p[i]
	LEAQ 1(DI), DI          // i++
	MOVQ DX, CX             // CX = state
	MOVQ (R8)(SI*8), DX     // t = dfa[b]
	SHRQ CX, DX             // t >>= state&63
loop:
	CMPQ DI, BX
	JLT  body
	ANDL $0x3f, DX
	CMPL DX, $6
	SETEQ AL
	MOVB AX, ret+24(FP)
	RET
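For anyone who wants to reproduce the numbers: the assembly also needs a Go-side declaration next to it. A minimal sketch, assuming the code lives in a scratch copy of the package rather than in unicode/utf8 itself (the build tag and file layout are mine):

//go:build amd64

package utf8

// Valid reports whether x consists entirely of valid UTF-8-encoded runes.
// The body is the assembly above; the dfa table is the Go variable defined earlier.
func Valid(x []byte) bool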