Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/go.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
- name: Test Race
env:
CGO_ENABLED: 1
run: go test -cpu="1,4" -short -race ./...
run: go test -cpu="1,4" -short -race -v ./...

build-special:
env:
Expand Down
32 changes: 21 additions & 11 deletions zstd/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ file out level insize outsize millis mb/s
silesia.tar zskp 1 211947520 73101992 643 313.87
silesia.tar zskp 2 211947520 67504318 969 208.38
silesia.tar zskp 3 211947520 64595893 2007 100.68
silesia.tar zskp 4 211947520 60995370 7691 26.28
silesia.tar zskp 4 211947520 60995370 8825 22.90

cgo zstd:
silesia.tar zstd 1 211947520 73605392 543 371.56
Expand All @@ -162,7 +162,7 @@ silesia.tar zstd 9 211947520 60212393 5063 39.92

gzip, stdlib/this package:
silesia.tar gzstd 1 211947520 80007735 1654 122.21
silesia.tar gzkp 1 211947520 80369488 1168 173.06
silesia.tar gzkp 1 211947520 80136201 1152 175.45

GOB stream of binary data. Highly compressible.
https://files.klauspost.com/compress/gob-stream.7z
Expand All @@ -171,13 +171,15 @@ file out level insize outsize millis mb/s
gob-stream zskp 1 1911399616 235022249 3088 590.30
gob-stream zskp 2 1911399616 205669791 3786 481.34
gob-stream zskp 3 1911399616 175034659 9636 189.17
gob-stream zskp 4 1911399616 167273881 29337 62.13
gob-stream zskp 4 1911399616 165609838 50369 36.19

gob-stream zstd 1 1911399616 249810424 2637 691.26
gob-stream zstd 3 1911399616 208192146 3490 522.31
gob-stream zstd 6 1911399616 193632038 6687 272.56
gob-stream zstd 9 1911399616 177620386 16175 112.70

gob-stream gzstd 1 1911399616 357382641 10251 177.82
gob-stream gzkp 1 1911399616 362156523 5695 320.08
gob-stream gzkp 1 1911399616 359753026 5438 335.20

The test data for the Large Text Compression Benchmark is the first
10^9 bytes of the English Wikipedia dump on Mar. 3, 2006.
Expand All @@ -187,11 +189,13 @@ file out level insize outsize millis mb/s
enwik9 zskp 1 1000000000 343848582 3609 264.18
enwik9 zskp 2 1000000000 317276632 5746 165.97
enwik9 zskp 3 1000000000 292243069 12162 78.41
enwik9 zskp 4 1000000000 275241169 36430 26.18
enwik9 zskp 4 1000000000 262183768 82837 11.51

enwik9 zstd 1 1000000000 358072021 3110 306.65
enwik9 zstd 3 1000000000 313734672 4784 199.35
enwik9 zstd 6 1000000000 295138875 10290 92.68
enwik9 zstd 9 1000000000 278348700 28549 33.40

enwik9 gzstd 1 1000000000 382578136 9604 99.30
enwik9 gzkp 1 1000000000 383825945 6544 145.73

Expand All @@ -202,13 +206,15 @@ file out level insize outsize millis mb/s
github-june-2days-2019.json zskp 1 6273951764 699045015 10620 563.40
github-june-2days-2019.json zskp 2 6273951764 617881763 11687 511.96
github-june-2days-2019.json zskp 3 6273951764 524340691 34043 175.75
github-june-2days-2019.json zskp 4 6273951764 503314661 93811 63.78
github-june-2days-2019.json zskp 4 6273951764 470320075 170190 35.16

github-june-2days-2019.json zstd 1 6273951764 766284037 8450 708.00
github-june-2days-2019.json zstd 3 6273951764 661889476 10927 547.57
github-june-2days-2019.json zstd 6 6273951764 642756859 22996 260.18
github-june-2days-2019.json zstd 9 6273951764 601974523 52413 114.16

github-june-2days-2019.json gzstd 1 6273951764 1164400847 29948 199.79
github-june-2days-2019.json gzkp 1 6273951764 1128755542 19236 311.03
github-june-2days-2019.json gzkp 1 6273951764 1125417694 21788 274.61

VM Image, Linux mint with a few installed applications:
https://files.klauspost.com/compress/rawstudio-mint14.7z
Expand All @@ -217,13 +223,15 @@ file out level insize outsize millis mb/s
rawstudio-mint14.tar zskp 1 8558382592 3667489370 20210 403.84
rawstudio-mint14.tar zskp 2 8558382592 3364592300 31873 256.07
rawstudio-mint14.tar zskp 3 8558382592 3158085214 77675 105.08
rawstudio-mint14.tar zskp 4 8558382592 3020370044 404956 20.16
rawstudio-mint14.tar zskp 4 8558382592 2965110639 857750 9.52

rawstudio-mint14.tar zstd 1 8558382592 3609250104 17136 476.27
rawstudio-mint14.tar zstd 3 8558382592 3341679997 29262 278.92
rawstudio-mint14.tar zstd 6 8558382592 3235846406 77904 104.77
rawstudio-mint14.tar zstd 9 8558382592 3160778861 140946 57.91

rawstudio-mint14.tar gzstd 1 8558382592 3926257486 57722 141.40
rawstudio-mint14.tar gzkp 1 8558382592 3970463184 41749 195.49
rawstudio-mint14.tar gzkp 1 8558382592 3962605659 45113 180.92

CSV data:
https://files.klauspost.com/compress/nyc-taxi-data-10M.csv.zst
Expand All @@ -232,13 +240,15 @@ file out level insize outsize millis mb/s
nyc-taxi-data-10M.csv zskp 1 3325605752 641339945 8925 355.35
nyc-taxi-data-10M.csv zskp 2 3325605752 591748091 11268 281.44
nyc-taxi-data-10M.csv zskp 3 3325605752 530289687 25239 125.66
nyc-taxi-data-10M.csv zskp 4 3325605752 490907191 65939 48.10
nyc-taxi-data-10M.csv zskp 4 3325605752 476268884 135958 23.33

nyc-taxi-data-10M.csv zstd 1 3325605752 687399637 8233 385.18
nyc-taxi-data-10M.csv zstd 3 3325605752 598514411 10065 315.07
nyc-taxi-data-10M.csv zstd 6 3325605752 570522953 20038 158.27
nyc-taxi-data-10M.csv zstd 9 3325605752 517554797 64565 49.12

nyc-taxi-data-10M.csv gzstd 1 3325605752 928656485 23876 132.83
nyc-taxi-data-10M.csv gzkp 1 3325605752 924718719 16388 193.53
nyc-taxi-data-10M.csv gzkp 1 3325605752 922257165 16780 189.00
```

## Decompressor
Expand Down
3 changes: 3 additions & 0 deletions zstd/dict_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,9 @@ func TestEncoder_SmallDict(t *testing.T) {
}
dicts = append(dicts, in)
for level := SpeedFastest; level < speedLast; level++ {
if isRaceTest && level >= SpeedBestCompression {
break
}
enc, err := NewWriter(nil, WithEncoderConcurrency(1), WithEncoderDict(in), WithEncoderLevel(level), WithWindowSize(1<<17))
if err != nil {
t.Fatal(err)
Expand Down
108 changes: 75 additions & 33 deletions zstd/enc_best.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,59 @@ package zstd

import (
"fmt"
"math/bits"

"github.com/klauspost/compress"
)

const (
bestLongTableBits = 20 // Bits used in the long match table
bestLongTableBits = 22 // Bits used in the long match table
bestLongTableSize = 1 << bestLongTableBits // Size of the table
bestLongLen = 8 // Bytes used for table hash

// Note: Increasing the short table bits or making the hash shorter
// can actually lead to compression degradation since it will 'steal' more from the
// long match table and match offsets are quite big.
// This greatly depends on the type of input.
bestShortTableBits = 16 // Bits used in the short match table
bestShortTableBits = 18 // Bits used in the short match table
bestShortTableSize = 1 << bestShortTableBits // Size of the table
bestShortLen = 4 // Bytes used for table hash

)

// match describes a candidate match considered by the best-compression encoder.
type match struct {
	offset int32 // Absolute position in src where the matched bytes start (distance is s-offset).
	s      int32 // Position in src where this match would begin.
	length int32 // Match length in bytes; set to 0 when the match is rejected as not worthwhile.
	rep    int32 // Repeat-offset index (1-3) for repeat matches; negative for regular offset matches.
	est    int32 // Estimated encoding cost in bits (lower is better); computed by estBits.
}

// highScore is a sentinel cost assigned to invalid or unprofitable matches
// so they always lose when estimates are compared.
const highScore = 25000

// estBits estimates the cost in output bits of encoding this match using the
// predefined (default) FSE tables, storing the result in m.est.
//
// bitsPerByte is the estimated cost of emitting one literal byte, scaled by
// 2^10 (so 1024 == 1 bit/byte), derived from the Shannon entropy of the input.
// The estimate adds the offset- and match-length-code cost and subtracts the
// literal bits saved by covering m.length bytes. If the match is a net loss
// (est > 0) it is disabled: length is zeroed and est set to highScore.
func (m *match) estBits(bitsPerByte int32) {
	mlc := mlCode(uint32(m.length - zstdMinMatch))
	var ofc uint8
	if m.rep < 0 {
		// Regular match: offset code from the match distance.
		// NOTE(review): +3 presumably skips the 3 repeat-offset codes — confirm
		// against the offset-code definition.
		ofc = ofCode(uint32(m.s-m.offset) + 3)
	} else {
		// Repeat match: offset code from the repeat index (1-3).
		ofc = ofCode(uint32(m.rep))
	}
	// Cost, excluding literals, from the predefined offset and match-length tables.
	ofTT, mlTT := fsePredefEnc[tableOffsets].ct.symbolTT[ofc], fsePredefEnc[tableMatchLengths].ct.symbolTT[mlc]

	// Add cost of match encoding...
	m.est = int32(ofTT.outBits + mlTT.outBits)
	m.est += int32(ofTT.deltaNbBits>>16 + mlTT.deltaNbBits>>16)
	// Subtract savings compared to literal encoding...
	// (>> 10 undoes the 2^10 scaling of bitsPerByte.)
	m.est -= (m.length * bitsPerByte) >> 10
	if m.est > 0 {
		// Unlikely gain..
		m.length = 0
		m.est = highScore
	}
}

// bestFastEncoder uses 2 tables, one for short matches (5 bytes) and one for long matches.
// The long match table contains the previous entry with the same hash,
// effectively making it a "chain" of length 2.
Expand Down Expand Up @@ -112,6 +147,14 @@ func (e *bestFastEncoder) Encode(blk *blockEnc, src []byte) {
return
}

// Use this to estimate literal cost.
// Scaled by 10 bits.
bitsPerByte := int32((compress.ShannonEntropyBits(src) * 1024) / len(src))
// Huffman can never go < 1 bit/byte
if bitsPerByte < 1024 {
bitsPerByte = 1024
}

// Override src
src = e.hist
sLimit := int32(len(src)) - inputMargin
Expand Down Expand Up @@ -148,29 +191,8 @@ encodeLoop:
panic("offset0 was 0")
}

type match struct {
offset int32
s int32
length int32
rep int32
}
matchAt := func(offset int32, s int32, first uint32, rep int32) match {
if s-offset >= e.maxMatchOff || load3232(src, offset) != first {
return match{offset: offset, s: s}
}
return match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
}

bestOf := func(a, b match) match {
aScore := b.s - a.s + a.length
bScore := a.s - b.s + b.length
if a.rep < 0 {
aScore = aScore - int32(bits.Len32(uint32(a.offset)))/8
}
if b.rep < 0 {
bScore = bScore - int32(bits.Len32(uint32(b.offset)))/8
}
if aScore >= bScore {
if a.est+(a.s-b.s)*bitsPerByte>>10 < b.est+(b.s-a.s)*bitsPerByte>>10 {
return a
}
return b
Expand All @@ -182,17 +204,31 @@ encodeLoop:
candidateL := e.longTable[nextHashL]
candidateS := e.table[nextHashS]

matchAt := func(offset int32, s int32, first uint32, rep int32) match {
if s-offset >= e.maxMatchOff || load3232(src, offset) != first {
return match{s: s, est: highScore}
}
m := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
m.estBits(bitsPerByte)
return m
}

best := bestOf(matchAt(candidateL.offset-e.cur, s, uint32(cv), -1), matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateS.prev-e.cur, s, uint32(cv), -1))

if canRepeat && best.length < goodEnough {
best = bestOf(best, matchAt(s-offset1+1, s+1, uint32(cv>>8), 1))
best = bestOf(best, matchAt(s-offset2+1, s+1, uint32(cv>>8), 2))
best = bestOf(best, matchAt(s-offset3+1, s+1, uint32(cv>>8), 3))
cv := uint32(cv >> 8)
spp := s + 1
best = bestOf(best, matchAt(spp-offset1, spp, cv, 1))
best = bestOf(best, matchAt(spp-offset2, spp, cv, 2))
best = bestOf(best, matchAt(spp-offset3, spp, cv, 3))
if best.length > 0 {
best = bestOf(best, matchAt(s-offset1+3, s+3, uint32(cv>>24), 1))
best = bestOf(best, matchAt(s-offset2+3, s+3, uint32(cv>>24), 2))
best = bestOf(best, matchAt(s-offset3+3, s+3, uint32(cv>>24), 3))
cv >>= 16
spp += 2
best = bestOf(best, matchAt(spp-offset1, spp, cv, 1))
best = bestOf(best, matchAt(spp-offset2, spp, cv, 2))
best = bestOf(best, matchAt(spp-offset3, spp, cv, 3))
}
}
// Load next and check...
Expand All @@ -218,12 +254,18 @@ encodeLoop:
candidateL = e.longTable[hashLen(cv, bestLongTableBits, bestLongLen)]
candidateL2 := e.longTable[hashLen(cv2, bestLongTableBits, bestLongLen)]

// Short at s+1
best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
// Long at s+1, s+2
best = bestOf(best, matchAt(candidateL.offset-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
best = bestOf(best, matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1))
best = bestOf(best, matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1))

if false {
// Short at s+3.
// Too often worse...
best = bestOf(best, matchAt(e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1))
}
// See if we can find a better match by checking where the current best ends.
// Use that offset to see if we can find a better full match.
if sAt := best.s + best.length; sAt < sLimit {
Expand Down Expand Up @@ -428,7 +470,7 @@ func (e *bestFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
e.Encode(blk, src)
}

// ResetDict will reset and set a dictionary if not nil
// Reset will reset and set a dictionary if not nil
func (e *bestFastEncoder) Reset(d *dict, singleBlock bool) {
e.resetBase(d, singleBlock)
if d == nil {
Expand Down
13 changes: 13 additions & 0 deletions zstd/encoder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ type testEncOpt struct {
func getEncOpts(cMax int) []testEncOpt {
var o []testEncOpt
for level := speedNotSet + 1; level < speedLast; level++ {
if isRaceTest && level >= SpeedBestCompression {
break
}
for conc := 1; conc <= 4; conc *= 2 {
for _, wind := range testWindowSizes {
addOpt := func(name string, options ...EOption) {
Expand Down Expand Up @@ -75,6 +78,7 @@ func TestEncoder_EncodeAllSimple(t *testing.T) {
in = append(in, in...)
for _, opts := range getEncOpts(4) {
t.Run(opts.name, func(t *testing.T) {
runtime.GC()
e, err := NewWriter(nil, opts.o...)
if err != nil {
t.Fatal(err)
Expand Down Expand Up @@ -172,6 +176,9 @@ func TestEncoder_EncodeAllEncodeXML(t *testing.T) {

for level := speedNotSet + 1; level < speedLast; level++ {
t.Run(level.String(), func(t *testing.T) {
if isRaceTest && level >= SpeedBestCompression {
t.SkipNow()
}
e, err := NewWriter(nil, WithEncoderLevel(level))
if err != nil {
t.Fatal(err)
Expand Down Expand Up @@ -291,6 +298,9 @@ func TestEncoder_EncodeAllTwain(t *testing.T) {

for level := speedNotSet + 1; level < speedLast; level++ {
t.Run(level.String(), func(t *testing.T) {
if isRaceTest && level >= SpeedBestCompression {
t.SkipNow()
}
for _, windowSize := range testWindowSizes {
t.Run(fmt.Sprintf("window:%d", windowSize), func(t *testing.T) {
e, err := NewWriter(nil, WithEncoderLevel(level), WithWindowSize(windowSize))
Expand Down Expand Up @@ -337,6 +347,9 @@ func TestEncoder_EncodeAllPi(t *testing.T) {

for level := speedNotSet + 1; level < speedLast; level++ {
t.Run(level.String(), func(t *testing.T) {
if isRaceTest && level >= SpeedBestCompression {
t.SkipNow()
}
for _, windowSize := range testWindowSizes {
t.Run(fmt.Sprintf("window:%d", windowSize), func(t *testing.T) {
e, err := NewWriter(nil, WithEncoderLevel(level), WithWindowSize(windowSize))
Expand Down
10 changes: 10 additions & 0 deletions zstd/race_enabled_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Copyright 2019+ Klaus Post. All rights reserved.
// License information can be found in the LICENSE file.

// +build race

package zstd

// init flags the run as race-enabled. This file is compiled only under the
// "race" build tag, so isRaceTest stays false in normal test runs; tests use
// it to skip the slowest compression levels when the race detector is on.
func init() {
	isRaceTest = true
}
5 changes: 5 additions & 0 deletions zstd/zstd_test.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
// Copyright 2019+ Klaus Post. All rights reserved.
// License information can be found in the LICENSE file.

package zstd

import (
Expand All @@ -9,6 +12,8 @@ import (
"time"
)

var isRaceTest bool

func TestMain(m *testing.M) {
ec := m.Run()
if ec == 0 && runtime.NumGoroutine() > 1 {
Expand Down