Skip to content

Commit 93b05d6

Browse files
authored
zstd: Improve best compression (#360)
* zstd: Improve best compression

See if we can find a better match by checking where the current best ends. Use that offset to see if we can find a better full match.

Before/after. Ignore speed, it may not be comparable:

```
gob-stream zskp 4 1911399616 171537212 32113 56.76
gob-stream zskp 4 1911399616 167273881 29337 62.13
enwik9 zskp 4 1000000000 276609671 44029 21.66
enwik9 zskp 4 1000000000 275241169 36430 26.18
github-june-2days-2019.json zskp 4 6273951764 512796117 97791 61.18
github-june-2days-2019.json zskp 4 6273951764 503314661 93811 63.78
nyc-taxi-data-10M.csv zskp 4 3325605752 495986829 89368 35.49
nyc-taxi-data-10M.csv zskp 4 3325605752 490907191 65939 48.10
```
1 parent 6522991 commit 93b05d6

File tree

2 files changed

+20
-6
lines changed

2 files changed

+20
-6
lines changed

zstd/README.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ file out level insize outsize millis mb/s
153153
silesia.tar zskp 1 211947520 73101992 643 313.87
154154
silesia.tar zskp 2 211947520 67504318 969 208.38
155155
silesia.tar zskp 3 211947520 65177448 1899 106.44
156-
silesia.tar zskp 4 211947520 61381950 8115 24.91
156+
silesia.tar zskp 4 211947520 60995370 7691 26.28
157157
158158
cgo zstd:
159159
silesia.tar zstd 1 211947520 73605392 543 371.56
@@ -172,7 +172,7 @@ file out level insize outsize millis mb/s
172172
gob-stream zskp 1 1911399616 235022249 3088 590.30
173173
gob-stream zskp 2 1911399616 205669791 3786 481.34
174174
gob-stream zskp 3 1911399616 185792019 9324 195.48
175-
gob-stream zskp 4 1911399616 171537212 32113 56.76
175+
gob-stream zskp 4 1911399616 167273881 29337 62.13
176176
gob-stream zstd 1 1911399616 249810424 2637 691.26
177177
gob-stream zstd 3 1911399616 208192146 3490 522.31
178178
gob-stream zstd 6 1911399616 193632038 6687 272.56
@@ -188,7 +188,7 @@ file out level insize outsize millis mb/s
188188
enwik9 zskp 1 1000000000 343848582 3609 264.18
189189
enwik9 zskp 2 1000000000 317276632 5746 165.97
190190
enwik9 zskp 3 1000000000 294540704 11725 81.34
191-
enwik9 zskp 4 1000000000 276609671 44029 21.66
191+
enwik9 zskp 4 1000000000 275241169 36430 26.18
192192
enwik9 zstd 1 1000000000 358072021 3110 306.65
193193
enwik9 zstd 3 1000000000 313734672 4784 199.35
194194
enwik9 zstd 6 1000000000 295138875 10290 92.68
@@ -203,7 +203,7 @@ file out level insize outsize millis mb/s
203203
github-june-2days-2019.json zskp 1 6273951764 699045015 10620 563.40
204204
github-june-2days-2019.json zskp 2 6273951764 617881763 11687 511.96
205205
github-june-2days-2019.json zskp 3 6273951764 537511906 29252 204.54
206-
github-june-2days-2019.json zskp 4 6273951764 512796117 97791 61.18
206+
github-june-2days-2019.json zskp 4 6273951764 503314661 93811 63.78
207207
github-june-2days-2019.json zstd 1 6273951764 766284037 8450 708.00
208208
github-june-2days-2019.json zstd 3 6273951764 661889476 10927 547.57
209209
github-june-2days-2019.json zstd 6 6273951764 642756859 22996 260.18
@@ -218,7 +218,7 @@ file out level insize outsize millis mb/s
218218
rawstudio-mint14.tar zskp 1 8558382592 3667489370 20210 403.84
219219
rawstudio-mint14.tar zskp 2 8558382592 3364592300 31873 256.07
220220
rawstudio-mint14.tar zskp 3 8558382592 3224594213 71751 113.75
221-
rawstudio-mint14.tar zskp 4 8558382592 3027332295 486243 16.79
221+
rawstudio-mint14.tar zskp 4 8558382592 3020370044 404956 20.16
222222
rawstudio-mint14.tar zstd 1 8558382592 3609250104 17136 476.27
223223
rawstudio-mint14.tar zstd 3 8558382592 3341679997 29262 278.92
224224
rawstudio-mint14.tar zstd 6 8558382592 3235846406 77904 104.77
@@ -233,7 +233,7 @@ file out level insize outsize millis mb/s
233233
nyc-taxi-data-10M.csv zskp 1 3325605752 641339945 8925 355.35
234234
nyc-taxi-data-10M.csv zskp 2 3325605752 591748091 11268 281.44
235235
nyc-taxi-data-10M.csv zskp 3 3325605752 538490114 19880 159.53
236-
nyc-taxi-data-10M.csv zskp 4 3325605752 495986829 89368 35.49
236+
nyc-taxi-data-10M.csv zskp 4 3325605752 490907191 65939 48.10
237237
nyc-taxi-data-10M.csv zstd 1 3325605752 687399637 8233 385.18
238238
nyc-taxi-data-10M.csv zstd 3 3325605752 598514411 10065 315.07
239239
nyc-taxi-data-10M.csv zstd 6 3325605752 570522953 20038 158.27

zstd/enc_best.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,20 @@ encodeLoop:
220220
best = bestOf(best, matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
221221
best = bestOf(best, matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1))
222222
best = bestOf(best, matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1))
223+
224+
// See if we can find a better match by checking where the current best ends.
225+
// Use that offset to see if we can find a better full match.
226+
if sAt := best.s + best.length; sAt < sLimit {
227+
nextHashL := hash8(load6432(src, sAt), bestLongTableBits)
228+
candidateEnd := e.longTable[nextHashL]
229+
if pos := candidateEnd.offset - e.cur - best.length; pos >= 0 {
230+
bestEnd := bestOf(best, matchAt(pos, best.s, load3232(src, best.s), -1))
231+
if pos := candidateEnd.prev - e.cur - best.length; pos >= 0 {
232+
bestEnd = bestOf(bestEnd, matchAt(pos, best.s, load3232(src, best.s), -1))
233+
}
234+
best = bestEnd
235+
}
236+
}
223237
}
224238

225239
// We have a match, we can store the forward value

0 commit comments

Comments
 (0)