Skip to content

Commit

Permalink
backport greatroar arm64 xxhash (#464)
Browse files Browse the repository at this point in the history
see cespare/xxhash#51

```
benchmark                                                    old ns/op      new ns/op      delta
BenchmarkDecoder_DecoderSmall/kppkn.gtb.zst-16               14740529       14093294       -4.39%
BenchmarkDecoder_DecoderSmall/geo.protodata.zst-16           3003737        3008035        +0.14%
BenchmarkDecoder_DecoderSmall/plrabn12.txt.zst-16            70052218       70885931       +1.19%
BenchmarkDecoder_DecoderSmall/lcet10.txt.zst-16              33424884       34714144       +3.86%
BenchmarkDecoder_DecoderSmall/asyoulik.txt.zst-16            9694846        9563735        -1.35%
BenchmarkDecoder_DecoderSmall/alice29.txt.zst-16             12322865       12782681       +3.73%
BenchmarkDecoder_DecoderSmall/html_x_4.zst-16                7290319        7134523        -2.14%
BenchmarkDecoder_DecoderSmall/paper-100k.pdf.zst-16          825251         816683         -1.04%
BenchmarkDecoder_DecoderSmall/fireworks.jpeg.zst-16          462903         486823         +5.17%
BenchmarkDecoder_DecoderSmall/urls.10K.zst-16                1333159120     1114784390     -16.38%
BenchmarkDecoder_DecoderSmall/html.zst-16                    3117950        3095118        -0.73%
BenchmarkDecoder_DecoderSmall/comp-data.bin.zst-16           272085         271508         -0.21%
BenchmarkDecoder_DecodeAll/kppkn.gtb.zst-16                  1411294        1407632        -0.26%
BenchmarkDecoder_DecodeAll/geo.protodata.zst-16              370870         367499         -0.91%
BenchmarkDecoder_DecodeAll/plrabn12.txt.zst-16               4721330        4718339        -0.06%
BenchmarkDecoder_DecodeAll/lcet10.txt.zst-16                 3517766        3487756        -0.85%
BenchmarkDecoder_DecodeAll/asyoulik.txt.zst-16               1186672        1180367        -0.53%
BenchmarkDecoder_DecodeAll/alice29.txt.zst-16                1498383        1502922        +0.30%
BenchmarkDecoder_DecodeAll/html_x_4.zst-16                   748113         742537         -0.75%
BenchmarkDecoder_DecodeAll/paper-100k.pdf.zst-16             99132          98206          -0.93%
BenchmarkDecoder_DecodeAll/fireworks.jpeg.zst-16             53805          53209          -1.11%
BenchmarkDecoder_DecodeAll/urls.10K.zst-16                   4041818        4028347        -0.33%
BenchmarkDecoder_DecodeAll/html.zst-16                       387794         383309         -1.16%
BenchmarkDecoder_DecodeAll/comp-data.bin.zst-16              34390          34296          -0.27%
BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-16          89510          88785          -0.81%
BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-16      23315          23128          -0.80%
BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-16       306437         325176         +6.12%
BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-16         226433         222589         -1.70%
BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-16       74612          74182          -0.58%
BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-16        95066          94304          -0.80%
BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-16           47498          46946          -1.16%
BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-16     6291           6237           -0.86%
BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-16     3498           3453           -1.29%
BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-16           295850         318916         +7.80%
BenchmarkDecoder_DecodeAllParallel/html.zst-16               24340          24196          -0.59%
BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-16      2220           2199           -0.95%

benchmark                                                    old MB/s     new MB/s     speedup
BenchmarkDecoder_DecoderSmall/kppkn.gtb.zst-16               100.03       104.63       1.05x
BenchmarkDecoder_DecoderSmall/geo.protodata.zst-16           315.84       315.39       1.00x
BenchmarkDecoder_DecoderSmall/plrabn12.txt.zst-16            55.03        54.38        0.99x
BenchmarkDecoder_DecoderSmall/lcet10.txt.zst-16              102.14       98.35        0.96x
BenchmarkDecoder_DecoderSmall/asyoulik.txt.zst-16            103.30       104.71       1.01x
BenchmarkDecoder_DecoderSmall/alice29.txt.zst-16             98.74        95.18        0.96x
BenchmarkDecoder_DecoderSmall/html_x_4.zst-16                449.47       459.29       1.02x
BenchmarkDecoder_DecoderSmall/paper-100k.pdf.zst-16          992.67       1003.08      1.01x
BenchmarkDecoder_DecoderSmall/fireworks.jpeg.zst-16          2127.32      2022.80      0.95x
BenchmarkDecoder_DecoderSmall/urls.10K.zst-16                4.21         5.04         1.20x
BenchmarkDecoder_DecoderSmall/html.zst-16                    262.74       264.67       1.01x
BenchmarkDecoder_DecoderSmall/comp-data.bin.zst-16           119.84       120.10       1.00x
BenchmarkDecoder_DecodeAll/kppkn.gtb.zst-16                  130.60       130.94       1.00x
BenchmarkDecoder_DecodeAll/geo.protodata.zst-16              319.76       322.69       1.01x
BenchmarkDecoder_DecodeAll/plrabn12.txt.zst-16               102.06       102.13       1.00x
BenchmarkDecoder_DecodeAll/lcet10.txt.zst-16                 121.31       122.36       1.01x
BenchmarkDecoder_DecodeAll/asyoulik.txt.zst-16               105.49       106.05       1.01x
BenchmarkDecoder_DecodeAll/alice29.txt.zst-16                101.50       101.20       1.00x
BenchmarkDecoder_DecodeAll/html_x_4.zst-16                   547.51       551.62       1.01x
BenchmarkDecoder_DecodeAll/paper-100k.pdf.zst-16             1032.97      1042.70      1.01x
BenchmarkDecoder_DecodeAll/fireworks.jpeg.zst-16             2287.77      2313.39      1.01x
BenchmarkDecoder_DecodeAll/urls.10K.zst-16                   173.71       174.29       1.00x
BenchmarkDecoder_DecodeAll/html.zst-16                       264.06       267.15       1.01x
BenchmarkDecoder_DecodeAll/comp-data.bin.zst-16              118.52       118.85       1.00x
BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-16          2059.22      2076.03      1.01x
BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-16      5086.37      5127.46      1.01x
BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-16       1572.47      1481.85      0.94x
BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-16         1884.68      1917.22      1.02x
BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-16       1677.73      1687.46      1.01x
BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-16        1599.82      1612.75      1.01x
BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-16           8623.46      8724.94      1.01x
BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-16     16276.14     16418.01     1.01x
BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-16     35194.18     35648.95     1.01x
BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-16           2373.12      2201.48      0.93x
BenchmarkDecoder_DecodeAllParallel/html.zst-16               4207.06      4232.02      1.01x
BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-16      1835.78      1853.29      1.01x

benchmark                                                    old allocs     new allocs     delta
BenchmarkDecoder_DecoderSmall/kppkn.gtb.zst-16               1              3              +200.00%
BenchmarkDecoder_DecoderSmall/geo.protodata.zst-16           1              1              +0.00%
BenchmarkDecoder_DecoderSmall/plrabn12.txt.zst-16            4              5              +25.00%
BenchmarkDecoder_DecoderSmall/lcet10.txt.zst-16              1              1              +0.00%
BenchmarkDecoder_DecoderSmall/asyoulik.txt.zst-16            1              1              +0.00%
BenchmarkDecoder_DecoderSmall/alice29.txt.zst-16             1              1              +0.00%
BenchmarkDecoder_DecoderSmall/html_x_4.zst-16                1              1              +0.00%
BenchmarkDecoder_DecoderSmall/paper-100k.pdf.zst-16          1              1              +0.00%
BenchmarkDecoder_DecoderSmall/fireworks.jpeg.zst-16          1              1              +0.00%
BenchmarkDecoder_DecoderSmall/urls.10K.zst-16                52             50             -3.85%
BenchmarkDecoder_DecoderSmall/html.zst-16                    1              1              +0.00%
BenchmarkDecoder_DecoderSmall/comp-data.bin.zst-16           1              1              +0.00%
BenchmarkDecoder_DecodeAll/kppkn.gtb.zst-16                  0              0              +0.00%
BenchmarkDecoder_DecodeAll/geo.protodata.zst-16              0              0              +0.00%
BenchmarkDecoder_DecodeAll/plrabn12.txt.zst-16               0              0              +0.00%
BenchmarkDecoder_DecodeAll/lcet10.txt.zst-16                 0              0              +0.00%
BenchmarkDecoder_DecodeAll/asyoulik.txt.zst-16               0              0              +0.00%
BenchmarkDecoder_DecodeAll/alice29.txt.zst-16                0              0              +0.00%
BenchmarkDecoder_DecodeAll/html_x_4.zst-16                   0              0              +0.00%
BenchmarkDecoder_DecodeAll/paper-100k.pdf.zst-16             0              0              +0.00%
BenchmarkDecoder_DecodeAll/fireworks.jpeg.zst-16             0              0              +0.00%
BenchmarkDecoder_DecodeAll/urls.10K.zst-16                   0              0              +0.00%
BenchmarkDecoder_DecodeAll/html.zst-16                       0              0              +0.00%
BenchmarkDecoder_DecodeAll/comp-data.bin.zst-16              0              0              +0.00%
BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-16          0              0              +0.00%
BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-16      0              0              +0.00%
BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-16       0              0              +0.00%
BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-16         0              0              +0.00%
BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-16       0              0              +0.00%
BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-16        0              0              +0.00%
BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-16           0              0              +0.00%
BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-16     0              0              +0.00%
BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-16     0              0              +0.00%
BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-16           0              0              +0.00%
BenchmarkDecoder_DecodeAllParallel/html.zst-16               0              0              +0.00%
BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-16      0              0              +0.00%

benchmark                                                    old bytes     new bytes     delta
BenchmarkDecoder_DecoderSmall/kppkn.gtb.zst-16               89288         113942        +27.61%
BenchmarkDecoder_DecoderSmall/geo.protodata.zst-16           10888         12311         +13.07%
BenchmarkDecoder_DecoderSmall/plrabn12.txt.zst-16            981385        1492955       +52.13%
BenchmarkDecoder_DecoderSmall/lcet10.txt.zst-16              385552        367000        -4.81%
BenchmarkDecoder_DecoderSmall/asyoulik.txt.zst-16            30752         30503         -0.81%
BenchmarkDecoder_DecoderSmall/alice29.txt.zst-16             108837        56868         -47.75%
BenchmarkDecoder_DecoderSmall/html_x_4.zst-16                75923         97336         +28.20%
BenchmarkDecoder_DecoderSmall/paper-100k.pdf.zst-16          48            48            +0.00%
BenchmarkDecoder_DecoderSmall/fireworks.jpeg.zst-16          1964          1909          -2.80%
BenchmarkDecoder_DecoderSmall/urls.10K.zst-16                25456880      25432640      -0.10%
BenchmarkDecoder_DecoderSmall/html.zst-16                    48            48            +0.00%
BenchmarkDecoder_DecoderSmall/comp-data.bin.zst-16           51            48            -5.88%
BenchmarkDecoder_DecodeAll/kppkn.gtb.zst-16                  0             0             +0.00%
BenchmarkDecoder_DecodeAll/geo.protodata.zst-16              5             0             -100.00%
BenchmarkDecoder_DecodeAll/plrabn12.txt.zst-16               2             0             -100.00%
BenchmarkDecoder_DecodeAll/lcet10.txt.zst-16                 26            0             -100.00%
BenchmarkDecoder_DecodeAll/asyoulik.txt.zst-16               0             0             +0.00%
BenchmarkDecoder_DecodeAll/alice29.txt.zst-16                10            0             -100.00%
BenchmarkDecoder_DecodeAll/html_x_4.zst-16                   0             0             +0.00%
BenchmarkDecoder_DecodeAll/paper-100k.pdf.zst-16             0             0             +0.00%
BenchmarkDecoder_DecodeAll/fireworks.jpeg.zst-16             1             0             -100.00%
BenchmarkDecoder_DecodeAll/urls.10K.zst-16                   0             0             +0.00%
BenchmarkDecoder_DecodeAll/html.zst-16                       0             0             +0.00%
BenchmarkDecoder_DecodeAll/comp-data.bin.zst-16              0             0             +0.00%
BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-16          234           231           -1.28%
BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-16      38            38            +0.00%
BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-16       1991          2287          +14.87%
BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-16         1501          1298          -13.52%
BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-16       132           130           -1.52%
BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-16        214           220           +2.80%
BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-16           259           258           -0.39%
BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-16     9             9             +0.00%
BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-16     6             6             +0.00%
BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-16           2868          3358          +17.09%
BenchmarkDecoder_DecodeAllParallel/html.zst-16               34            34            +0.00%
BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-16      0             0             +0.00%
```
  • Loading branch information
lizthegrey authored Jan 9, 2022
1 parent f59d5b1 commit 35a5ed5
Show file tree
Hide file tree
Showing 3 changed files with 196 additions and 4 deletions.
189 changes: 189 additions & 0 deletions zstd/internal/xxhash/xxhash_arm64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
// Build constraints for the arm64 assembly implementation.
// They must agree with the constraints on the Go declarations of Sum64 and
// writeBlocks: the pure-Go fallback is selected when appengine is set, so the
// assembly has to be excluded in that configuration as well, otherwise both
// files would define the same symbols. (The arm64 restriction itself comes
// from the _arm64 file name suffix.)

//go:build !appengine && gc && !purego
// +build !appengine
// +build gc
// +build !purego

#include "textflag.h"

// Register allocation.
//
// Symbolic names for the general-purpose registers shared by the macros and
// by both TEXT routines in this file. Every routine uses the same mapping, so
// the macros below can refer to these names directly.

// Pointer to the digest state; loaded from the d argument in writeBlocks.
#define digest R1
#define h R2 // Return value.
#define p R3 // Input pointer.
// Length of the input slice in bytes.
#define len R4
#define nblocks R5 // len / 32.
// The five multiplicative constants, loaded from the primes<> table.
#define prime1 R7
#define prime2 R8
#define prime3 R9
#define prime4 R10
#define prime5 R11
// The four running accumulators updated by blocksLoop.
#define v1 R12
#define v2 R13
#define v3 R14
#define v4 R15
// Input words loaded from memory; also used as scratch registers.
#define x1 R20
#define x2 R21
#define x3 R22
#define x4 R23

// round folds the 64-bit input word x into the accumulator acc:
//
//   acc = rotl(acc + x*prime2, 31) * prime1
//
// The rotate-left by 31 is written as a rotate-right by 64-31.
// Only acc is modified; x, prime1 and prime2 are read.
//
// Note: the last line ends in a backslash, so the blank line that follows the
// macro is part of the definition and must be kept.
#define round(acc, x) \
MADD prime2, acc, x, acc \
ROR $64-31, acc \
MUL prime1, acc \

// x = round(0, x).
//
// round0 is round specialised for a zero accumulator, computed in place:
//
//   x = rotl(x*prime2, 31) * prime1
//
// With acc == 0 the multiply-add of round degenerates to a plain multiply,
// so no separate accumulator register is needed. The input value of x is
// overwritten.
//
// Note: the last line ends in a backslash, so the blank line that follows the
// macro is part of the definition and must be kept.
#define round0(x) \
MUL prime2, x \
ROR $64-31, x \
MUL prime1, x \

// mergeRound mixes one accumulator x into the hash value h:
//
//   x = round0(x)
//   h = (h ^ x) * prime1 + prime4
//
// x is clobbered (it is replaced by round0(x)), so the accumulator passed in
// cannot be reused afterwards.
//
// Note: the last line ends in a backslash, so the blank line that follows the
// macro is part of the definition and must be kept.
#define mergeRound(x) \
round0(x) \
EOR x, h \
MADD h, prime4, prime1, h \

// Update v[1-4] with 32-byte blocks. Assumes len >= 32.
//
// blocksLoop consumes len/32 whole blocks starting at p. For each block it
// loads four 64-bit words and applies round to v1..v4 with one word each.
//
// Register effects:
//   - nblocks is set to len >> 5 and counted down to zero.
//   - p is advanced by 32 bytes per block (post-indexed LDP.P); the second
//     pair of words is then read at -16(p), i.e. bytes 16..31 of the block.
//   - x1..x4 are overwritten; len is left unchanged.
//
// The loop tests nblocks only after the body has run, so it must not be
// entered with nblocks == 0; that is why len >= 32 is required.
//
// The macro defines the label "loop", so it can be expanded at most once per
// TEXT routine. The loads and the counter decrement are interleaved with the
// rounds; keep the instruction order as is.
//
// Note: the last line ends in a backslash, so the blank line that follows the
// macro is part of the definition and must be kept.
#define blocksLoop() \
LSR $5, len, nblocks \
PCALIGN $16 \
loop: \
LDP.P 32(p), (x1, x2) \
round(v1, x1) \
LDP -16(p), (x3, x4) \
round(v2, x2) \
SUB $1, nblocks \
round(v3, x3) \
round(v4, x4) \
CBNZ nblocks, loop \


// The primes are repeated here to ensure that they're stored
// in a contiguous array, so we can load them with LDP.
//
// Layout: five 8-byte entries at offsets 0, 8, 16, 24 and 32, holding
// prime1 through prime5 in that order (40 bytes in total). The symbol is
// file-local (the <> suffix), read-only and contains no pointers.
DATA primes<> +0(SB)/8, $11400714785074694791
DATA primes<> +8(SB)/8, $14029467366897019727
DATA primes<>+16(SB)/8, $1609587929392839161
DATA primes<>+24(SB)/8, $9650029242287828579
DATA primes<>+32(SB)/8, $2870177450012600261
GLOBL primes<>(SB), NOPTR+RODATA, $40


// func Sum64(b []byte) uint64
//
// Sum64 hashes b in a single call and stores the 64-bit result in ret+24(FP).
//
// Inputs of 32 bytes or more are first run through blocksLoop and the four
// accumulators are merged into h. The bytes left over after the last whole
// block are then consumed in 16-, 8-, 4-, 2- and 1-byte steps, selected by
// testing bits 4..0 of len, and a final shift/multiply sequence mixes h.
//
// The routine has no stack frame (NOFRAME) and makes no calls.
TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32
// Load the slice's data pointer and length; the capacity word is not read.
LDP b_base+0(FP), (p, len)

// Load all five primes from the contiguous table.
LDP primes<> +0(SB), (prime1, prime2)
LDP primes<>+16(SB), (prime3, prime4)
MOVD primes<>+32(SB), prime5

CMP $32, len
CSEL LO, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 }
BLO afterLoop

// Initial accumulators:
//   v1 = prime1 + prime2, v2 = prime2, v3 = 0, v4 = -prime1.
ADD prime1, prime2, v1
MOVD prime2, v2
MOVD $0, v3
NEG prime1, v4

// Consume all whole 32-byte blocks; p ends up just past the last one.
blocksLoop()

// h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18).
// Each rotate-left by k is written as a rotate-right by 64-k; x1..x4 are
// free to use as scratch now that the block loop is done.
ROR $64-1, v1, x1
ROR $64-7, v2, x2
ADD x1, x2
ROR $64-12, v3, x3
ROR $64-18, v4, x4
ADD x3, x4
ADD x2, x4, h

// Fold each accumulator into h. mergeRound overwrites its argument.
mergeRound(v1)
mergeRound(v2)
mergeRound(v3)
mergeRound(v4)

afterLoop:
// Add the total input length to h.
ADD len, h

// Tail handling. Bit k of len tells whether a 2^k-byte chunk remains, so each
// step below is skipped with TBZ when its bit is clear.

// 16 bytes: two 8-byte words, each mixed as
//   h = rotl(h ^ round0(x), 27) * prime1 + prime4.
// The rotate is applied to h and to x separately (x via the "@>" rotated
// operand), which gives the same result as rotating h ^ x once.
TBZ $4, len, try8
LDP.P 16(p), (x1, x2)

round0(x1)
ROR $64-27, h
EOR x1 @> 64-27, h, h
MADD h, prime4, prime1, h

round0(x2)
ROR $64-27, h
EOR x2 @> 64-27, h
MADD h, prime4, prime1, h

try8:
// 8 bytes: one more word, mixed exactly like the words above.
TBZ $3, len, try4
MOVD.P 8(p), x1

round0(x1)
ROR $64-27, h
EOR x1 @> 64-27, h
MADD h, prime4, prime1, h

try4:
// 4 bytes: zero-extended 32-bit load, mixed as
//   h = rotl(h ^ x*prime1, 23) * prime2 + prime3.
TBZ $2, len, try2
MOVWU.P 4(p), x2

MUL prime1, x2
ROR $64-23, h
EOR x2 @> 64-23, h
MADD h, prime3, prime2, h

try2:
// 2 bytes: load a halfword and split it into its low byte (x1) and high
// byte (x2), then mix them one at a time, low byte first, each as
//   h = rotl(h ^ byte*prime5, 11) * prime1.
TBZ $1, len, try1
MOVHU.P 2(p), x3
AND $255, x3, x1
LSR $8, x3, x2

MUL prime5, x1
ROR $64-11, h
EOR x1 @> 64-11, h
MUL prime1, h

MUL prime5, x2
ROR $64-11, h
EOR x2 @> 64-11, h
MUL prime1, h

try1:
// 1 byte: the last remaining byte, mixed like the single bytes above.
TBZ $0, len, end
MOVBU (p), x4

MUL prime5, x4
ROR $64-11, h
EOR x4 @> 64-11, h
MUL prime1, h

end:
// Final mix:
//   h ^= h >> 33; h *= prime2; h ^= h >> 29; h *= prime3; h ^= h >> 32.
EOR h >> 33, h
MUL prime2, h
EOR h >> 29, h
MUL prime3, h
EOR h >> 32, h

MOVD h, ret+24(FP)
RET


// func writeBlocks(d *Digest, b []byte) int
//
// Assumes len(b) >= 32.
//
// writeBlocks runs every whole 32-byte block of b through blocksLoop,
// updating the four accumulators held in the first 32 bytes of *d in place.
// It returns len(b) rounded down to a multiple of 32, i.e. the number of
// bytes consumed; any trailing partial block is left untouched.
//
// The routine has no stack frame (NOFRAME) and makes no calls.
TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40
// Only prime1 and prime2 are needed: they are the only primes round uses.
LDP primes<>(SB), (prime1, prime2)

// Load state. Assume v[1-4] are stored contiguously.
MOVD d+0(FP), digest
LDP 0(digest), (v1, v2)
LDP 16(digest), (v3, v4)

// Load the slice's data pointer and length; the capacity word is not read.
LDP b_base+8(FP), (p, len)

blocksLoop()

// Store updated state.
STP (v1, v2), 0(digest)
STP (v3, v4), 16(digest)

// Clear the low five bits of len to get the byte count of the whole blocks.
BIC $31, len
MOVD len, ret+32(FP)
RET
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
//go:build !appengine && gc && !purego
// +build !appengine,gc,!purego
//go:build (amd64 || arm64) && !appengine && gc && !purego
// +build amd64 arm64
// +build !appengine
// +build gc
// +build !purego

package xxhash

Expand Down
4 changes: 2 additions & 2 deletions zstd/internal/xxhash/xxhash_other.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//go:build !amd64 || appengine || !gc || purego
// +build !amd64 appengine !gc purego
//go:build (!amd64 && !arm64) || appengine || !gc || purego
// +build !amd64,!arm64 appengine !gc purego

package xxhash

Expand Down

0 comments on commit 35a5ed5

Please sign in to comment.