Skip to content

Commit

Permalink
s2: Search at match end in best mode (#358)
Browse files Browse the repository at this point in the history
When we have a potential match, see if we can find a better one by searching at the offset of the first 8 mismatched bytes.

Improves compression for all cases.

Before/after pairs:
```
Reading nyc-taxi-data-10M.csv...
Compressing... 3325605752 -> 786648492 [23.65%]; 7.517s, 421.9MB/s
Compressing... 3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s

Reading adresser.json...
Compressing... 7983034785 -> 380912248 [4.77%]; 3.924s, 1940.0MB/s
Compressing... 7983034785 -> 372523343 [4.67%]; 4.34s, 1754.1MB/s

Reading 10gb.tar...
Compressing... 10065157632 -> 5215462149 [51.82%]; 29.462s, 325.8MB/s
Compressing... 10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/s

Reading enwik9...
Compressing... 1000000000 -> 373289535 [37.33%]; 4.047s, 235.6MB/s
Compressing... 1000000000 -> 371710958 [37.17%]; 4.513s, 211.3MB/s

Reading sample.tar...
Compressing... 808796160 -> 277822539 [34.35%]; 2.043s, 377.5MB/s
Compressing... 808796160 -> 277710349 [34.34%]; 2.409s, 320.2MB/s
```
  • Loading branch information
klauspost authored Apr 21, 2021
1 parent 7acff5b commit e3ae23b
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 29 deletions.
10 changes: 5 additions & 5 deletions s2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -543,27 +543,27 @@ Some examples compared on 16 core CPU, amd64 assembly used:
* enwik10
Default... 10000000000 -> 4761467548 [47.61%]; 1.098s, 8685.6MB/s
Better... 10000000000 -> 4219438251 [42.19%]; 1.925s, 4954.2MB/s
Best... 10000000000 -> 3649340179 [36.49%]; 40.05s, 238.1MB/s
Best... 10000000000 -> 3627364337 [36.27%]; 43.051s, 221.5MB/s
* github-june-2days-2019.json
Default... 6273951764 -> 1043196283 [16.63%]; 431ms, 13882.3MB/s
Better... 6273951764 -> 949146808 [15.13%]; 547ms, 10938.4MB/s
Best... 6273951764 -> 845168908 [13.47%]; 8.878s, 673.9MB/s
Best... 6273951764 -> 832855506 [13.27%]; 9.455s, 632.8MB/s
* nyc-taxi-data-10M.csv
Default... 3325605752 -> 1095998837 [32.96%]; 324ms, 9788.7MB/s
Better... 3325605752 -> 954776589 [28.71%]; 491ms, 6459.4MB/s
Best... 3325605752 -> 786648492 [23.65%]; 7.628s, 415.8MB/s
Best... 3325605752 -> 779098746 [23.43%]; 8.29s, 382.6MB/s
* 10gb.tar
Default... 10065157632 -> 5916578242 [58.78%]; 1.028s, 9337.4MB/s
Better... 10065157632 -> 5649207485 [56.13%]; 1.597s, 6010.6MB/s
Best... 10065157632 -> 5215462149 [51.82%]; 29.977s, 320.2MB/s
Best... 10065157632 -> 5208719802 [51.75%]; 32.78s, 292.8MB/s
* consensus.db.10gb
Default... 10737418240 -> 4562648848 [42.49%]; 882ms, 11610.0MB/s
Better... 10737418240 -> 4542428129 [42.30%]; 1.533s, 6679.7MB/s
Best... 10737418240 -> 4280128613 [39.86%]; 41.758s, 245.2MB/s
Best... 10737418240 -> 4244773384 [39.53%]; 42.96s, 238.4MB/s
```

Decompression speed should be around the same as using the 'better' compression mode.
Expand Down
65 changes: 41 additions & 24 deletions s2/encode_best.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ func encodeBlockBest(dst, src []byte) (d int) {
offset int
s int
length int
score int
rep bool
}
var best match
Expand All @@ -82,6 +83,20 @@ func encodeBlockBest(dst, src []byte) (d int) {
candidateL := lTable[hashL]
candidateS := sTable[hashS]

score := func(m match) int {
// Matches that are longer forward are penalized since we must emit it as a literal.
score := m.length - m.s
if nextEmit == m.s {
// If we do not have to emit literals, we save 1 byte
score++
}
offset := m.s - m.offset
if m.rep {
return score - emitRepeatSize(offset, m.length)
}
return score - emitCopySize(offset, m.length)
}

matchAt := func(offset, s int, first uint32, rep bool) match {
if best.length != 0 && best.s-best.offset == s-offset {
// Don't retest if we have the same offset.
Expand All @@ -101,41 +116,26 @@ func encodeBlockBest(dst, src []byte) (d int) {
m.length += 8
}
m.length -= offset
return m
}
score := func(m match, otherS int) int {
// Matches that are longer forward are penalized since we must emit it as a literal.
score := m.length - (m.s - otherS)
if nextEmit == m.s {
// If we do not have to emit literals, we save 1 byte
score++
}
offset := m.s - m.offset
if m.rep {
return score - emitRepeatSize(offset, m.length)
m.score = score(m)
if m.score <= -m.s {
// Eliminate if no savings, we might find a better one.
m.length = 0
}
return score - emitCopySize(offset, m.length)
return m
}

bestOf := func(a, b match) match {
if b.length == 0 {
return a
}
if a.length == 0 {
return b
}
as := score(a, b.s)
bs := score(b, a.s)
as := a.score + b.s
bs := b.score + a.s
if as >= bs {
if as <= 0 {
// Eliminate if no savings, we might find a better one.
a.length = 0
}
return a
}
if bs <= 0 {
// Eliminate if no savings, we might find a better one.
b.length = 0
}
return b
}

Expand All @@ -159,7 +159,7 @@ func encodeBlockBest(dst, src []byte) (d int) {
best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true))

// s+2
if best.length < 100 {
if true {
nextShort = sTable[hash4(cv>>8, sTableBits)]
s++
cv = load64(src, s)
Expand All @@ -169,6 +169,23 @@ func encodeBlockBest(dst, src []byte) (d int) {
best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false))
best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false))
}
// Search for a match at best match end, see if that is better.
if sAt := best.s + best.length; sAt < sLimit {
sBack := best.s
backL := best.length
// Load initial values
cv = load64(src, sBack)
// Search for mismatch
next := lTable[hash8(load64(src, sAt), lTableBits)]
//next := sTable[hash4(load64(src, sAt), sTableBits)]

if checkAt := getCur(next) - backL; checkAt > 0 {
best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
}
if checkAt := getPrev(next) - backL; checkAt > 0 {
best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
}
}
}
}

Expand Down

0 comments on commit e3ae23b

Please sign in to comment.