From e3ae23b41e7c16db933b8bbe9678095196e49338 Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Wed, 21 Apr 2021 17:16:35 +0200
Subject: [PATCH] s2: Search at match end in best mode (#358)

When we have a potential match, see if we can find a better one by searching at the offset of the first 8 mismatched bytes.

Improves compression for all cases.

Before/after pairs:
```
Reading nyc-taxi-data-10M.csv...
Compressing... 3325605752 -> 786648492 [23.65%]; 7.517s, 421.9MB/s
Compressing... 3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s

Reading adresser.json...
Compressing... 7983034785 -> 380912248 [4.77%]; 3.924s, 1940.0MB/s
Compressing... 7983034785 -> 372523343 [4.67%]; 4.34s, 1754.1MB/s

Reading 10gb.tar...
Compressing... 10065157632 -> 5215462149 [51.82%]; 29.462s, 325.8MB/s
Compressing... 10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/s

Reading enwik9...
Compressing... 1000000000 -> 373289535 [37.33%]; 4.047s, 235.6MB/s
Compressing... 1000000000 -> 371710958 [37.17%]; 4.513s, 211.3MB/s

Reading sample.tar...
Compressing... 808796160 -> 277822539 [34.35%]; 2.043s, 377.5MB/s
Compressing... 808796160 -> 277710349 [34.34%]; 2.409s, 320.2MB/s
```
---
 s2/README.md      | 10 ++++----
 s2/encode_best.go | 65 ++++++++++++++++++++++++++++++-----------------
 2 files changed, 46 insertions(+), 29 deletions(-)

diff --git a/s2/README.md b/s2/README.md
index d90f96821d..33571f30e9 100644
--- a/s2/README.md
+++ b/s2/README.md
@@ -543,27 +543,27 @@ Some examples compared on 16 core CPU, amd64 assembly used:
 * enwik10
 Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s
 Better...  10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s
-Best...    10000000000 -> 3649340179 [36.49%]; 40.05s, 238.1MB/s
+Best...    10000000000 -> 3627364337 [36.27%]; 43.051s, 221.5MB/s
 
 * github-june-2days-2019.json
 Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s
 Better...  6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s
-Best...    6273951764 -> 845168908 [13.47%]; 8.878s, 673.9MB/s
+Best...    6273951764 -> 832855506 [13.27%]; 9.455s, 632.8MB/s
 
 * nyc-taxi-data-10M.csv
 Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s
 Better...  3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s
-Best...    3325605752 -> 786648492 [23.65%]; 7.628s, 415.8MB/s
+Best...    3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s
 
 * 10gb.tar
 Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s
 Better...  10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s
-Best...    10065157632 -> 5215462149 [51.82%]; 29.977s, 320.2MB/s
+Best...    10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/s
 
 * consensus.db.10gb
 Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s
 Better...  10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s
-Best...    10737418240 -> 4280128613 [39.86%]; 41.758s, 245.2MB/s
+Best...    10737418240 -> 4244773384 [39.53%]; 42.96s, 238.4MB/s
 ```
 
 Decompression speed should be around the same as using the 'better' compression mode. 
diff --git a/s2/encode_best.go b/s2/encode_best.go
index 5ebb366d2c..e6dbf6ed5e 100644
--- a/s2/encode_best.go
+++ b/s2/encode_best.go
@@ -68,6 +68,7 @@ func encodeBlockBest(dst, src []byte) (d int) {
 			offset int
 			s      int
 			length int
+			score  int
 			rep    bool
 		}
 		var best match
@@ -82,6 +83,20 @@ func encodeBlockBest(dst, src []byte) (d int) {
 			candidateL := lTable[hashL]
 			candidateS := sTable[hashS]
 
+			score := func(m match) int {
+				// Matches that are longer forward are penalized since we must emit it as a literal.
+				score := m.length - m.s
+				if nextEmit == m.s {
+					// If we do not have to emit literals, we save 1 byte
+					score++
+				}
+				offset := m.s - m.offset
+				if m.rep {
+					return score - emitRepeatSize(offset, m.length)
+				}
+				return score - emitCopySize(offset, m.length)
+			}
+
 			matchAt := func(offset, s int, first uint32, rep bool) match {
 				if best.length != 0 && best.s-best.offset == s-offset {
 					// Don't retest if we have the same offset.
@@ -101,21 +116,14 @@ func encodeBlockBest(dst, src []byte) (d int) {
 					m.length += 8
 				}
 				m.length -= offset
-				return m
-			}
-			score := func(m match, otherS int) int {
-				// Matches that are longer forward are penalized since we must emit it as a literal.
-				score := m.length - (m.s - otherS)
-				if nextEmit == m.s {
-					// If we do not have to emit literals, we save 1 byte
-					score++
-				}
-				offset := m.s - m.offset
-				if m.rep {
-					return score - emitRepeatSize(offset, m.length)
+				m.score = score(m)
+				if m.score <= -m.s {
+					// Eliminate if no savings, we might find a better one.
+					m.length = 0
 				}
-				return score - emitCopySize(offset, m.length)
+				return m
 			}
+
 			bestOf := func(a, b match) match {
 				if b.length == 0 {
 					return a
@@ -123,19 +131,11 @@ func encodeBlockBest(dst, src []byte) (d int) {
 				if a.length == 0 {
 					return b
 				}
-				as := score(a, b.s)
-				bs := score(b, a.s)
+				as := a.score + b.s
+				bs := b.score + a.s
 				if as >= bs {
-					if as <= 0 {
-						// Eliminate if no savings, we might find a better one.
-						a.length = 0
-					}
 					return a
 				}
-				if bs <= 0 {
-					// Eliminate if no savings, we might find a better one.
-					b.length = 0
-				}
 				return b
 			}
 
@@ -159,7 +159,7 @@ func encodeBlockBest(dst, src []byte) (d int) {
 					best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true))
 
 					// s+2
-					if best.length < 100 {
+					if true {
 						nextShort = sTable[hash4(cv>>8, sTableBits)]
 						s++
 						cv = load64(src, s)
@@ -169,6 +169,23 @@ func encodeBlockBest(dst, src []byte) (d int) {
 						best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false))
 						best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false))
 					}
+					// Search for a match at best match end, see if that is better.
+					if sAt := best.s + best.length; sAt < sLimit {
+						sBack := best.s
+						backL := best.length
+						// Load initial values
+						cv = load64(src, sBack)
+						// Search for mismatch
+						next := lTable[hash8(load64(src, sAt), lTableBits)]
+						//next := sTable[hash4(load64(src, sAt), sTableBits)]
+
+						if checkAt := getCur(next) - backL; checkAt > 0 {
+							best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
+						}
+						if checkAt := getPrev(next) - backL; checkAt > 0 {
+							best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
+						}
+					}
 				}
 			}