From e3ae23b41e7c16db933b8bbe9678095196e49338 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Wed, 21 Apr 2021 17:16:35 +0200 Subject: [PATCH] s2: Search at match end in best mode (#358) When we have a potential match, see if we can find a better one by searching at the offset of the first 8 mismatched bytes. Improves compression for all cases. Before/after pairs: ``` Reading nyc-taxi-data-10M.csv... Compressing... 3325605752 -> 786648492 [23.65%]; 7.517s, 421.9MB/s Compressing... 3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s Reading adresser.json... Compressing... 7983034785 -> 380912248 [4.77%]; 3.924s, 1940.0MB/s Compressing... 7983034785 -> 372523343 [4.67%]; 4.34s, 1754.1MB/s Reading 10gb.tar... Compressing... 10065157632 -> 5215462149 [51.82%]; 29.462s, 325.8MB/s Compressing... 10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/s Reading enwik9... Compressing... 1000000000 -> 373289535 [37.33%]; 4.047s, 235.6MB/s Compressing... 1000000000 -> 371710958 [37.17%]; 4.513s, 211.3MB/s Reading sample.tar... Compressing... 808796160 -> 277822539 [34.35%]; 2.043s, 377.5MB/s Compressing... 808796160 -> 277710349 [34.34%]; 2.409s, 320.2MB/s ``` --- s2/README.md | 10 ++++---- s2/encode_best.go | 65 ++++++++++++++++++++++++++++++----------------- 2 files changed, 46 insertions(+), 29 deletions(-) diff --git a/s2/README.md b/s2/README.md index d90f96821d..33571f30e9 100644 --- a/s2/README.md +++ b/s2/README.md @@ -543,27 +543,27 @@ Some examples compared on 16 core CPU, amd64 assembly used: * enwik10 Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s Better... 10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s -Best... 10000000000 -> 3649340179 [36.49%]; 40.05s, 238.1MB/s +Best... 10000000000 -> 3627364337 [36.27%]; 43.051s, 221.5MB/s * github-june-2days-2019.json Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s Better... 6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s -Best... 
6273951764 -> 845168908 [13.47%]; 8.878s, 673.9MB/s +Best... 6273951764 -> 832855506 [13.27%]; 9.455s, 632.8MB/s * nyc-taxi-data-10M.csv Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s Better... 3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s -Best... 3325605752 -> 786648492 [23.65%]; 7.628s, 415.8MB/s +Best... 3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s * 10gb.tar Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s Better... 10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s -Best... 10065157632 -> 5215462149 [51.82%]; 29.977s, 320.2MB/s +Best... 10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/s * consensus.db.10gb Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s Better... 10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s -Best... 10737418240 -> 4280128613 [39.86%]; 41.758s, 245.2MB/s +Best... 10737418240 -> 4244773384 [39.53%]; 42.96s, 238.4MB/s ``` Decompression speed should be around the same as using the 'better' compression mode. diff --git a/s2/encode_best.go b/s2/encode_best.go index 5ebb366d2c..e6dbf6ed5e 100644 --- a/s2/encode_best.go +++ b/s2/encode_best.go @@ -68,6 +68,7 @@ func encodeBlockBest(dst, src []byte) (d int) { offset int s int length int + score int rep bool } var best match @@ -82,6 +83,20 @@ func encodeBlockBest(dst, src []byte) (d int) { candidateL := lTable[hashL] candidateS := sTable[hashS] + score := func(m match) int { + // Matches that are longer forward are penalized since we must emit it as a literal. + score := m.length - m.s + if nextEmit == m.s { + // If we do not have to emit literals, we save 1 byte + score++ + } + offset := m.s - m.offset + if m.rep { + return score - emitRepeatSize(offset, m.length) + } + return score - emitCopySize(offset, m.length) + } + matchAt := func(offset, s int, first uint32, rep bool) match { if best.length != 0 && best.s-best.offset == s-offset { // Don't retest if we have the same offset. 
@@ -101,21 +116,14 @@ func encodeBlockBest(dst, src []byte) (d int) { m.length += 8 } m.length -= offset - return m - } - score := func(m match, otherS int) int { - // Matches that are longer forward are penalized since we must emit it as a literal. - score := m.length - (m.s - otherS) - if nextEmit == m.s { - // If we do not have to emit literals, we save 1 byte - score++ - } - offset := m.s - m.offset - if m.rep { - return score - emitRepeatSize(offset, m.length) + m.score = score(m) + if m.score <= -m.s { + // Eliminate if no savings, we might find a better one. + m.length = 0 } - return score - emitCopySize(offset, m.length) + return m } + bestOf := func(a, b match) match { if b.length == 0 { return a @@ -123,19 +131,11 @@ func encodeBlockBest(dst, src []byte) (d int) { if a.length == 0 { return b } - as := score(a, b.s) - bs := score(b, a.s) + as := a.score + b.s + bs := b.score + a.s if as >= bs { - if as <= 0 { - // Eliminate if no savings, we might find a better one. - a.length = 0 - } return a } - if bs <= 0 { - // Eliminate if no savings, we might find a better one. - b.length = 0 - } return b } @@ -159,7 +159,7 @@ func encodeBlockBest(dst, src []byte) (d int) { best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true)) // s+2 - if best.length < 100 { + if true { nextShort = sTable[hash4(cv>>8, sTableBits)] s++ cv = load64(src, s) @@ -169,6 +169,23 @@ func encodeBlockBest(dst, src []byte) (d int) { best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false)) best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false)) } + // Search for a match at best match end, see if that is better. 
+ if sAt := best.s + best.length; sAt < sLimit { + sBack := best.s + backL := best.length + // Load initial values + cv = load64(src, sBack) + // Search for mismatch + next := lTable[hash8(load64(src, sAt), lTableBits)] + //next := sTable[hash4(load64(src, sAt), sTableBits)] + + if checkAt := getCur(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) + } + if checkAt := getPrev(next) - backL; checkAt > 0 { + best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false)) + } + } } }