Skip to content

Commit

Permalink
zstd: Optimize seqdec amd64 asm
Browse files Browse the repository at this point in the history
copyMemoryPrecise now generates a loop over 16-byte blocks with a single
branchless 16-byte fixup after it.

This is a tiny bit faster on the whole and quite a bit faster for some
inputs. Benchmark results on Intel Core i7-3770K:

	name                                                         old speed      new speed      delta
	Decoder_DecoderSmall/kppkn.gtb.zst-8                          369MB/s ± 0%   374MB/s ± 1%  +1.56%  (p=0.008 n=5+5)
	Decoder_DecoderSmall/geo.protodata.zst-8                      977MB/s ± 0%  1056MB/s ± 1%  +8.17%  (p=0.008 n=5+5)
	Decoder_DecoderSmall/plrabn12.txt.zst-8                       291MB/s ± 0%   289MB/s ± 0%  -0.74%  (p=0.008 n=5+5)
	Decoder_DecoderSmall/lcet10.txt.zst-8                         329MB/s ± 1%   333MB/s ± 0%  +1.23%  (p=0.008 n=5+5)
	Decoder_DecoderSmall/asyoulik.txt.zst-8                       310MB/s ± 0%   310MB/s ± 1%    ~     (p=1.000 n=5+5)
	Decoder_DecoderSmall/alice29.txt.zst-8                        291MB/s ± 0%   291MB/s ± 1%    ~     (p=0.421 n=5+5)
	Decoder_DecoderSmall/html_x_4.zst-8                          2.07GB/s ± 0%  2.15GB/s ± 2%  +4.05%  (p=0.008 n=5+5)
	Decoder_DecoderSmall/paper-100k.pdf.zst-8                    3.58GB/s ± 3%  3.74GB/s ± 1%  +4.31%  (p=0.008 n=5+5)
	Decoder_DecoderSmall/fireworks.jpeg.zst-8                    8.57GB/s ± 0%  8.60GB/s ± 0%    ~     (p=0.056 n=5+5)
	Decoder_DecoderSmall/urls.10K.zst-8                           474MB/s ± 1%   507MB/s ± 1%  +6.80%  (p=0.008 n=5+5)
	Decoder_DecoderSmall/html.zst-8                               745MB/s ± 0%   803MB/s ± 0%  +7.68%  (p=0.008 n=5+5)
	Decoder_DecoderSmall/comp-data.bin.zst-8                      399MB/s ± 1%   400MB/s ± 0%    ~     (p=0.841 n=5+5)
	Decoder_DecodeAll/kppkn.gtb.zst-8                             521MB/s ± 0%   521MB/s ± 0%    ~     (p=0.841 n=5+5)
	Decoder_DecodeAll/geo.protodata.zst-8                        1.27GB/s ± 1%  1.29GB/s ± 0%  +1.19%  (p=0.008 n=5+5)
	Decoder_DecodeAll/plrabn12.txt.zst-8                          429MB/s ± 0%   427MB/s ± 0%  -0.51%  (p=0.032 n=5+5)
	Decoder_DecodeAll/lcet10.txt.zst-8                            435MB/s ± 0%   439MB/s ± 0%  +0.94%  (p=0.008 n=5+5)
	Decoder_DecodeAll/asyoulik.txt.zst-8                          438MB/s ± 0%   436MB/s ± 0%  -0.39%  (p=0.008 n=5+5)
	Decoder_DecodeAll/alice29.txt.zst-8                           423MB/s ± 0%   420MB/s ± 1%  -0.72%  (p=0.008 n=5+5)
	Decoder_DecodeAll/html_x_4.zst-8                             1.59GB/s ± 0%  1.59GB/s ± 1%  +0.54%  (p=0.032 n=5+5)
	Decoder_DecodeAll/paper-100k.pdf.zst-8                       4.53GB/s ± 1%  4.54GB/s ± 1%    ~     (p=0.310 n=5+5)
	Decoder_DecodeAll/fireworks.jpeg.zst-8                       9.64GB/s ± 1%  9.57GB/s ± 0%    ~     (p=0.151 n=5+5)
	Decoder_DecodeAll/urls.10K.zst-8                              683MB/s ± 0%   681MB/s ± 0%    ~     (p=0.056 n=5+5)
	Decoder_DecodeAll/html.zst-8                                 1.04GB/s ± 1%  1.06GB/s ± 0%  +1.77%  (p=0.008 n=5+5)
	Decoder_DecodeAll/comp-data.bin.zst-8                         398MB/s ± 1%   399MB/s ± 1%    ~     (p=1.000 n=5+5)
	Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/fastest-8    439MB/s ± 0%   437MB/s ± 0%  -0.39%  (p=0.016 n=5+5)
	Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/default-8    448MB/s ± 0%   448MB/s ± 0%    ~     (p=0.841 n=5+5)
	Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/better-8     478MB/s ± 0%   477MB/s ± 0%    ~     (p=0.151 n=5+5)
	Decoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/best-8       463MB/s ± 0%   460MB/s ± 0%  -0.57%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/e.txt/fastest-8                       9.62GB/s ± 3%  9.66GB/s ± 1%    ~     (p=0.841 n=5+5)
	Decoder_DecodeAllFiles/e.txt/default-8                        394MB/s ± 0%   395MB/s ± 0%    ~     (p=0.056 n=5+5)
	Decoder_DecodeAllFiles/e.txt/better-8                         438MB/s ± 0%   442MB/s ± 0%  +0.82%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/e.txt/best-8                           501MB/s ± 0%   506MB/s ± 0%  +1.07%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/fse-artifact3.bin/fastest-8           1.04GB/s ± 0%  1.05GB/s ± 1%    ~     (p=0.056 n=5+5)
	Decoder_DecodeAllFiles/fse-artifact3.bin/default-8           1.20GB/s ± 1%  1.20GB/s ± 1%    ~     (p=0.095 n=5+5)
	Decoder_DecodeAllFiles/fse-artifact3.bin/better-8            1.01GB/s ± 0%  1.00GB/s ± 1%  -0.82%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/fse-artifact3.bin/best-8               386MB/s ± 0%   383MB/s ± 0%  -0.57%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/gettysburg.txt/fastest-8               271MB/s ± 1%   275MB/s ± 1%  +1.59%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/gettysburg.txt/default-8               224MB/s ± 1%   223MB/s ± 1%    ~     (p=0.222 n=5+5)
	Decoder_DecodeAllFiles/gettysburg.txt/better-8                228MB/s ± 0%   226MB/s ± 0%  -0.89%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/gettysburg.txt/best-8                  223MB/s ± 1%   221MB/s ± 1%  -1.03%  (p=0.016 n=5+5)
	Decoder_DecodeAllFiles/html.txt/fastest-8                     592MB/s ± 1%   611MB/s ± 0%  +3.20%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/html.txt/default-8                     597MB/s ± 0%   607MB/s ± 0%  +1.71%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/html.txt/better-8                      623MB/s ± 0%   633MB/s ± 0%  +1.57%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/html.txt/best-8                        603MB/s ± 0%   610MB/s ± 0%  +1.25%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/pi.txt/fastest-8                      9.59GB/s ± 1%  9.70GB/s ± 1%  +1.16%  (p=0.032 n=5+5)
	Decoder_DecodeAllFiles/pi.txt/default-8                       391MB/s ± 0%   393MB/s ± 0%  +0.62%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/pi.txt/better-8                        437MB/s ± 1%   441MB/s ± 2%    ~     (p=0.087 n=5+5)
	Decoder_DecodeAllFiles/pi.txt/best-8                          501MB/s ± 0%   507MB/s ± 0%  +1.22%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/pngdata.bin/fastest-8                 1.66GB/s ± 1%  1.70GB/s ± 0%  +2.49%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/pngdata.bin/default-8                 1.49GB/s ± 0%  1.51GB/s ± 0%  +1.18%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/pngdata.bin/better-8                  1.87GB/s ± 0%  1.90GB/s ± 1%    ~     (p=0.056 n=5+5)
	Decoder_DecodeAllFiles/pngdata.bin/best-8                    1.44GB/s ± 1%  1.46GB/s ± 0%  +1.75%  (p=0.008 n=5+5)
	Decoder_DecodeAllFiles/sharnd.out/fastest-8                  9.64GB/s ± 1%  9.66GB/s ± 1%    ~     (p=0.841 n=5+5)
	Decoder_DecodeAllFiles/sharnd.out/default-8                  9.70GB/s ± 1%  9.70GB/s ± 2%    ~     (p=1.000 n=5+5)
	Decoder_DecodeAllFiles/sharnd.out/better-8                   9.71GB/s ± 1%  9.79GB/s ± 1%    ~     (p=0.151 n=5+5)
	Decoder_DecodeAllFiles/sharnd.out/best-8                     9.76GB/s ± 0%  9.80GB/s ± 0%    ~     (p=0.056 n=5+5)
	Decoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/fastest-8  1.85GB/s ± 0%  1.85GB/s ± 0%  -0.31%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/default-8  1.86GB/s ± 0%  1.85GB/s ± 0%  -0.47%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/better-8   2.00GB/s ± 0%  2.00GB/s ± 0%  -0.32%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/best-8     1.93GB/s ± 0%  1.93GB/s ± 0%  -0.22%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/e.txt/fastest-8                      37.7GB/s ± 0%  37.5GB/s ± 0%  -0.38%  (p=0.016 n=5+5)
	Decoder_DecodeAllFilesP/e.txt/default-8                      1.68GB/s ± 0%  1.69GB/s ± 0%  +0.55%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/e.txt/better-8                       1.91GB/s ± 0%  1.92GB/s ± 0%  +0.96%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/e.txt/best-8                         2.22GB/s ± 0%  2.25GB/s ± 0%  +1.50%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/fse-artifact3.bin/fastest-8          5.18GB/s ± 0%  5.05GB/s ± 2%  -2.50%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/fse-artifact3.bin/default-8          5.50GB/s ± 1%  5.34GB/s ± 1%  -2.86%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/fse-artifact3.bin/better-8           5.11GB/s ± 0%  5.14GB/s ± 0%  +0.57%  (p=0.016 n=5+5)
	Decoder_DecodeAllFilesP/fse-artifact3.bin/best-8             2.36GB/s ± 0%  2.37GB/s ± 0%  +0.20%  (p=0.032 n=5+5)
	Decoder_DecodeAllFilesP/gettysburg.txt/fastest-8             1.16GB/s ± 0%  1.16GB/s ± 0%    ~     (p=0.056 n=5+5)
	Decoder_DecodeAllFilesP/gettysburg.txt/default-8             1.09GB/s ± 0%  1.08GB/s ± 0%  -1.19%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/gettysburg.txt/better-8              1.09GB/s ± 0%  1.08GB/s ± 1%  -0.96%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/gettysburg.txt/best-8                1.03GB/s ± 3%  1.02GB/s ± 0%    ~     (p=0.151 n=5+5)
	Decoder_DecodeAllFilesP/html.txt/fastest-8                   2.50GB/s ± 1%  2.56GB/s ± 0%  +2.39%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/html.txt/default-8                   2.51GB/s ± 0%  2.55GB/s ± 0%  +1.69%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/html.txt/better-8                    2.61GB/s ± 0%  2.66GB/s ± 0%  +1.93%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/html.txt/best-8                      2.53GB/s ± 0%  2.56GB/s ± 0%  +1.13%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/pi.txt/fastest-8                     37.8GB/s ± 0%  37.6GB/s ± 0%  -0.44%  (p=0.016 n=5+5)
	Decoder_DecodeAllFilesP/pi.txt/default-8                     1.67GB/s ± 0%  1.68GB/s ± 0%  +0.61%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/pi.txt/better-8                      1.91GB/s ± 0%  1.93GB/s ± 0%  +0.82%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/pi.txt/best-8                        2.23GB/s ± 0%  2.26GB/s ± 0%  +1.35%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/pngdata.bin/fastest-8                6.99GB/s ± 0%  7.00GB/s ± 0%    ~     (p=0.690 n=5+5)
	Decoder_DecodeAllFilesP/pngdata.bin/default-8                6.88GB/s ± 0%  6.87GB/s ± 0%    ~     (p=0.222 n=5+5)
	Decoder_DecodeAllFilesP/pngdata.bin/better-8                 8.49GB/s ± 0%  8.44GB/s ± 1%    ~     (p=0.310 n=5+5)
	Decoder_DecodeAllFilesP/pngdata.bin/best-8                   6.59GB/s ± 1%  6.53GB/s ± 1%  -0.96%  (p=0.032 n=5+5)
	Decoder_DecodeAllFilesP/sharnd.out/fastest-8                 37.8GB/s ± 0%  37.5GB/s ± 0%  -0.86%  (p=0.008 n=5+5)
	Decoder_DecodeAllFilesP/sharnd.out/default-8                 37.9GB/s ± 1%  38.0GB/s ± 1%    ~     (p=0.310 n=5+5)
	Decoder_DecodeAllFilesP/sharnd.out/better-8                  37.9GB/s ± 0%  37.8GB/s ± 2%    ~     (p=0.841 n=5+5)
	Decoder_DecodeAllFilesP/sharnd.out/best-8                    37.8GB/s ± 0%  38.0GB/s ± 1%    ~     (p=0.310 n=5+5)
	Decoder_DecodeAllParallel/kppkn.gtb.zst-8                    2.20GB/s ± 0%  2.20GB/s ± 0%    ~     (p=1.000 n=5+5)
	Decoder_DecodeAllParallel/geo.protodata.zst-8                5.37GB/s ± 0%  5.39GB/s ± 0%  +0.35%  (p=0.008 n=5+5)
	Decoder_DecodeAllParallel/plrabn12.txt.zst-8                 1.77GB/s ± 0%  1.76GB/s ± 0%  -0.19%  (p=0.008 n=5+5)
	Decoder_DecodeAllParallel/lcet10.txt.zst-8                   1.90GB/s ± 0%  1.92GB/s ± 0%  +0.80%  (p=0.008 n=5+5)
	Decoder_DecodeAllParallel/asyoulik.txt.zst-8                 1.83GB/s ± 0%  1.83GB/s ± 0%    ~     (p=0.841 n=5+5)
	Decoder_DecodeAllParallel/alice29.txt.zst-8                  1.74GB/s ± 0%  1.74GB/s ± 0%    ~     (p=0.548 n=5+5)
	Decoder_DecodeAllParallel/html_x_4.zst-8                     6.55GB/s ± 0%  6.49GB/s ± 0%  -0.97%  (p=0.008 n=5+5)
	Decoder_DecodeAllParallel/paper-100k.pdf.zst-8               18.3GB/s ± 0%  18.3GB/s ± 0%    ~     (p=0.056 n=5+5)
	Decoder_DecodeAllParallel/fireworks.jpeg.zst-8               37.4GB/s ± 0%  37.2GB/s ± 1%  -0.57%  (p=0.016 n=4+5)
	Decoder_DecodeAllParallel/urls.10K.zst-8                     2.97GB/s ± 0%  2.96GB/s ± 0%    ~     (p=0.310 n=5+5)
	Decoder_DecodeAllParallel/html.zst-8                         4.42GB/s ± 1%  4.43GB/s ± 0%    ~     (p=0.556 n=5+4)
	Decoder_DecodeAllParallel/comp-data.bin.zst-8                1.69GB/s ± 1%  1.70GB/s ± 0%  +0.84%  (p=0.008 n=5+5)
	[Geo mean]                                                   1.77GB/s       1.78GB/s       +0.57%
  • Loading branch information
greatroar committed Jul 3, 2022
1 parent 51e1025 commit cc3f110
Show file tree
Hide file tree
Showing 2 changed files with 506 additions and 249 deletions.
62 changes: 44 additions & 18 deletions zstd/_generate/gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -1135,9 +1135,9 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
e.copyMemoryPrecise("1", c.literals, c.outBase, ll)
} else {
e.copyMemoryND("1", c.literals, c.outBase, ll)
ADDQ(ll, c.literals)
ADDQ(ll, c.outBase)
}
ADDQ(ll, c.literals)
ADDQ(ll, c.outBase)
ADDQ(ll, c.outPosition)
}

Expand Down Expand Up @@ -1203,7 +1203,6 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
*/
e.copyMemoryPrecise("4", ptr, c.outBase, ml)
ADDQ(ml, c.outPosition)
ADDQ(ml, c.outBase)
// Note: for the current go tests this branch is taken in 99.53% cases,
// this is why we repeat a little code here.
handleLoop()
Expand All @@ -1219,7 +1218,6 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
}
*/
e.copyMemoryPrecise("5", ptr, c.outBase, v)
ADDQ(v, c.outBase)
ADDQ(v, c.outPosition)
SUBQ(v, ml)
// fallback to the next block
Expand Down Expand Up @@ -1254,7 +1252,6 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
ADDQ(ml, c.outPosition)
if e.safeMem {
e.copyMemoryPrecise("2", src, c.outBase, ml)
ADDQ(ml, c.outBase)
} else {
dst := GP64()
MOVQ(c.outBase, dst)
Expand Down Expand Up @@ -1312,9 +1309,43 @@ func (e executeSimple) copyMemoryND(suffix string, src, dst, length reg.GPVirtua
}

// copyMemoryPrecise will copy memory in blocks of 16 bytes,
// without overwriting nor overreading.
// without overreading. It adds length to src and dst,
// preserving length.
func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPVirtual) {
label := "copy_" + suffix
n := GP64()
MOVQ(length, n)
SUBQ(U8(16), n)
JB(LabelRef("copy_" + suffix + "_small"))

// If length >= 16, copy blocks of 16 bytes and handle any remainder
// by a block copy that overlaps with the last full block.
{
t := XMM()

loop := "copy_" + suffix + "_loop"
Label(loop)
{
MOVUPS(Mem{Base: src}, t)
MOVUPS(t, Mem{Base: dst})
ADDQ(U8(16), src)
ADDQ(U8(16), dst)
SUBQ(U8(16), n)
JAE(LabelRef(loop))
}

// n is now in the range [-16,-1].
// -16 means we copy the entire last block again.
// That should happen about 1/16th of the time,
// so we don't bother to check for it.
LEAQ(Mem{Base: src, Index: n, Disp: 16, Scale: 1}, src)
LEAQ(Mem{Base: dst, Index: n, Disp: 16, Scale: 1}, dst)
MOVUPS(Mem{Base: src, Disp: -16}, t)
MOVUPS(t, Mem{Base: dst, Disp: -16})

JMP(LabelRef("copy_" + suffix + "_end"))
}

Label("copy_" + suffix + "_small")
ofs := GP64()
s := Mem{Base: src, Index: ofs, Scale: 1}
d := Mem{Base: dst, Index: ofs, Scale: 1}
Expand Down Expand Up @@ -1351,23 +1382,18 @@ func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPV

Label("copy_" + suffix + "_qword")
TESTQ(U32(0x8), length)
JZ(LabelRef("copy_" + suffix + "_test"))
JZ(LabelRef("copy_" + suffix + "_add"))

// copy eight bytes if length & 0x08 != 0
MOVQ(s, tmp)
MOVQ(tmp, d)
ADDQ(U8(8), ofs)
JMP(LabelRef("copy_" + suffix + "_test"))

// copy in 16-byte chunks
Label(label)
t := XMM()
MOVUPS(s, t)
MOVUPS(t, d)
ADDQ(U8(16), ofs)
Label("copy_" + suffix + "_test")
CMPQ(ofs, length)
JB(LabelRef(label))
Label("copy_" + suffix + "_add")
ADDQ(length, dst)
ADDQ(length, src)

Label("copy_" + suffix + "_end")
}

// copyOverlappedMemory will copy one byte at the time from src to dst.
Expand Down
Loading

0 comments on commit cc3f110

Please sign in to comment.