From 2bb69be191060caa8faf032ba91006f16d65da4b Mon Sep 17 00:00:00 2001
From: Klaus Post
Date: Wed, 24 Mar 2021 17:22:49 +0100
Subject: [PATCH] zstd: Big speedup on small dictionary encodes (#345)

All credit goes to @tony2001.

As shown in #344, the speed of small dictionary compression tasks (inputs < 32K) can be improved significantly by keeping track of the state of the hash table.

This effectively implements #344, but avoids any penalty for non-dictionary encodes and extends the functionality to the "better" compression mode as well.

This change will also make it easier to [remove the copy of the literal dictionary](https://github.com/klauspost/compress/blob/a0dc84a8cf242dde7e21f8aba26126ca4621ff8c/zstd/enc_base.go#L171) every time an encode starts, and to add specialized code to deal with this.
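In short: each dictionary-aware encoder keeps a pristine copy of the dict-initialized hash table, marks which fixed-size shards of the live table it writes to while encoding, and on Reset copies back only the touched shards (falling back to a full copy when most shards are dirty). A minimal sketch of the pattern follows; names and table sizes are simplified for illustration and this is not the patch itself:

```go
package sketch

// Dirty-shard tracking, reduced to its core. The real encoders use
// their own table sizes; dictShardBits = 6 matches the patch.
const (
	tableBits     = 10 // illustration only; the fast encoder uses 15
	tableSize     = 1 << tableBits
	dictShardBits = 6
	shardCnt      = 1 << (tableBits - dictShardBits) // number of shards
	shardSize     = tableSize / shardCnt             // entries per shard
)

type tableEntry struct {
	offset int32
	val    uint32
}

type encoder struct {
	table      [tableSize]tableEntry
	dictTable  []tableEntry // table state right after dict init (assumed len == tableSize)
	shardDirty [shardCnt]bool
	allDirty   bool
}

// markShardDirty records that the shard holding entryNum was modified.
// Called on every table write, so it must stay a single division.
func (e *encoder) markShardDirty(entryNum uint32) {
	e.shardDirty[entryNum/shardSize] = true
}

// reset restores the dict-initialized state, copying only dirty shards
// unless most of the table is dirty anyway.
func (e *encoder) reset() {
	dirty := 0
	if !e.allDirty {
		for _, d := range e.shardDirty {
			if d {
				dirty++
			}
		}
	}
	if e.allDirty || dirty > shardCnt/2 {
		copy(e.table[:], e.dictTable)
		for i := range e.shardDirty {
			e.shardDirty[i] = false
		}
	} else {
		for i := range e.shardDirty {
			if !e.shardDirty[i] {
				continue
			}
			copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
			e.shardDirty[i] = false
		}
	}
	e.allDirty = false
}
```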
```
benchmark  old ns/op  new ns/op  delta
BenchmarkEncodeAllDict0_1024/length-19-level-fastest-dict-1-32  5729  870  -84.82%
BenchmarkEncodeAllDict0_1024/length-19-level-default-dict-1-32  59694  2115  -96.46%
BenchmarkEncodeAllDict0_1024/length-19-level-better-dict-1-32  197183  2454  -98.76%
BenchmarkEncodeAllDict0_1024/length-5-level-fastest-dict-1-32  5596  600  -89.28%
BenchmarkEncodeAllDict0_1024/length-5-level-default-dict-1-32  59342  1222  -97.94%
BenchmarkEncodeAllDict0_1024/length-5-level-better-dict-1-32  194466  1958  -98.99%
BenchmarkEncodeAllDict0_1024/length-659-level-fastest-dict-1-32  13343  13132  -1.58%
BenchmarkEncodeAllDict0_1024/length-659-level-default-dict-1-32  72651  33988  -53.22%
BenchmarkEncodeAllDict0_1024/length-659-level-better-dict-1-32  211509  22635  -89.30%
BenchmarkEncodeAllDict0_1024/length-174-level-fastest-dict-1-32  12190  10318  -15.36%
BenchmarkEncodeAllDict0_1024/length-174-level-default-dict-1-32  71443  28580  -60.00%
BenchmarkEncodeAllDict0_1024/length-174-level-better-dict-1-32  213304  17914  -91.60%
BenchmarkEncodeAllDict0_1024/length-5-level-fastest-dict-1#01-32  5582  595  -89.33%
BenchmarkEncodeAllDict0_1024/length-5-level-default-dict-1#01-32  58721  1221  -97.92%
BenchmarkEncodeAllDict0_1024/length-5-level-better-dict-1#01-32  196875  1963  -99.00%
BenchmarkEncodeAllDict0_1024/length-659-level-fastest-dict-1#01-32  13260  13132  -0.97%
BenchmarkEncodeAllDict0_1024/length-659-level-default-dict-1#01-32  71944  33896  -52.89%
BenchmarkEncodeAllDict0_1024/length-659-level-better-dict-1#01-32  207200  22533  -89.12%
BenchmarkEncodeAllDict0_1024/length-174-level-fastest-dict-1#01-32  12218  10295  -15.74%
BenchmarkEncodeAllDict0_1024/length-174-level-default-dict-1#01-32  69490  28531  -58.94%
BenchmarkEncodeAllDict0_1024/length-174-level-better-dict-1#01-32  205039  18020  -91.21%
BenchmarkEncodeAllDict0_1024/length-5-level-fastest-dict-1#02-32  5579  599  -89.26%
BenchmarkEncodeAllDict0_1024/length-5-level-default-dict-1#02-32  60810  1228  -97.98%
BenchmarkEncodeAllDict0_1024/length-5-level-better-dict-1#02-32  198740  1953  -99.02%
BenchmarkEncodeAllDict0_1024/length-659-level-fastest-dict-1#02-32  13352  13128  -1.68%
BenchmarkEncodeAllDict0_1024/length-659-level-default-dict-1#02-32  72544  33887  -53.29%
BenchmarkEncodeAllDict0_1024/length-659-level-better-dict-1#02-32  213331  22516  -89.45%
BenchmarkEncodeAllDict0_1024/length-174-level-fastest-dict-1#02-32  12204  10299  -15.61%
BenchmarkEncodeAllDict0_1024/length-174-level-default-dict-1#02-32  69317  28421  -59.00%
BenchmarkEncodeAllDict0_1024/length-174-level-better-dict-1#02-32  207613  17917  -91.37%
BenchmarkEncodeAllDict0_1024/length-5-level-fastest-dict-1#03-32  5542  600  -89.17%
BenchmarkEncodeAllDict0_1024/length-5-level-default-dict-1#03-32  59132  1218  -97.94%
BenchmarkEncodeAllDict0_1024/length-5-level-better-dict-1#03-32  196451  1952  -99.01%
BenchmarkEncodeAllDict0_1024/length-659-level-fastest-dict-1#03-32  13319  13112  -1.55%
BenchmarkEncodeAllDict0_1024/length-659-level-default-dict-1#03-32  70234  33843  -51.81%
BenchmarkEncodeAllDict0_1024/length-659-level-better-dict-1#03-32  209384  22447  -89.28%
BenchmarkEncodeAllDict0_1024/length-174-level-fastest-dict-1#03-32  12285  10297  -16.18%
BenchmarkEncodeAllDict0_1024/length-174-level-default-dict-1#03-32  71972  28585  -60.28%
BenchmarkEncodeAllDict0_1024/length-174-level-better-dict-1#03-32  215483  17902  -91.69%
BenchmarkEncodeAllDict1024_8192/length-1076-level-fastest-dict-1-32  16508  16221  -1.74%
BenchmarkEncodeAllDict1024_8192/length-1076-level-default-dict-1-32  83569  41344  -50.53%
BenchmarkEncodeAllDict1024_8192/length-1076-level-better-dict-1-32  220306  39384  -82.12%
BenchmarkEncodeAllDict1024_8192/length-5872-level-fastest-dict-1-32  41125  40975  -0.36%
BenchmarkEncodeAllDict1024_8192/length-5872-level-default-dict-1-32  163203  77122  -52.74%
BenchmarkEncodeAllDict1024_8192/length-5872-level-better-dict-1-32  318789  137116  -56.99%
BenchmarkEncodeAllDict1024_8192/length-1076-level-fastest-dict-1#01-32  16586  16294  -1.76%
BenchmarkEncodeAllDict1024_8192/length-1076-level-default-dict-1#01-32  82607  41120  -50.22%
BenchmarkEncodeAllDict1024_8192/length-1076-level-better-dict-1#01-32  219278  39179  -82.13%
BenchmarkEncodeAllDict1024_8192/length-5872-level-fastest-dict-1#01-32  42267  41093  -2.78%
BenchmarkEncodeAllDict1024_8192/length-5872-level-default-dict-1#01-32  164353  76905  -53.21%
BenchmarkEncodeAllDict1024_8192/length-5872-level-better-dict-1#01-32  327857  136501  -58.37%
BenchmarkEncodeAllDict1024_8192/length-1076-level-fastest-dict-1#02-32  16554  16177  -2.28%
BenchmarkEncodeAllDict1024_8192/length-1076-level-default-dict-1#02-32  83337  41239  -50.52%
BenchmarkEncodeAllDict1024_8192/length-1076-level-better-dict-1#02-32  226392  39385  -82.60%
BenchmarkEncodeAllDict1024_8192/length-5872-level-fastest-dict-1#02-32  41175  40834  -0.83%
BenchmarkEncodeAllDict1024_8192/length-5872-level-default-dict-1#02-32  160614  77318  -51.86%
BenchmarkEncodeAllDict1024_8192/length-5872-level-better-dict-1#02-32  313359  136739  -56.36%
BenchmarkEncodeAllDict1024_8192/length-1076-level-fastest-dict-1#03-32  16413  16274  -0.85%
BenchmarkEncodeAllDict1024_8192/length-1076-level-default-dict-1#03-32  81907  41151  -49.76%
BenchmarkEncodeAllDict1024_8192/length-1076-level-better-dict-1#03-32  222585  39181  -82.40%
BenchmarkEncodeAllDict1024_8192/length-5872-level-fastest-dict-1#03-32  41232  40978  -0.62%
BenchmarkEncodeAllDict1024_8192/length-5872-level-default-dict-1#03-32  159086  77235  -51.45%
BenchmarkEncodeAllDict1024_8192/length-5872-level-better-dict-1#03-32  309822  136600  -55.91%
BenchmarkEncodeAllDict8192_16384/length-12131-level-fastest-dict-1-32  55120  55056  -0.12%
BenchmarkEncodeAllDict8192_16384/length-12131-level-default-dict-1-32  291966  132353  -54.67%
BenchmarkEncodeAllDict8192_16384/length-12131-level-better-dict-1-32  467914  206802  -55.80%
BenchmarkEncodeAllDict8192_16384/length-12131-level-fastest-dict-1#01-32  53770  54785  +1.89%
BenchmarkEncodeAllDict8192_16384/length-12131-level-default-dict-1#01-32  291053  130230  -55.26%
BenchmarkEncodeAllDict8192_16384/length-12131-level-better-dict-1#01-32  476829  205292  -56.95%
BenchmarkEncodeAllDict8192_16384/length-9024-level-fastest-dict-1-32  31805  31891  +0.27%
BenchmarkEncodeAllDict8192_16384/length-9024-level-default-dict-1-32  116904  61027  -47.80%
BenchmarkEncodeAllDict8192_16384/length-9024-level-better-dict-1-32  260057  95128  -63.42%
BenchmarkEncodeAllDict8192_16384/length-12131-level-fastest-dict-1#02-32  54833  54341  -0.90%
BenchmarkEncodeAllDict8192_16384/length-12131-level-default-dict-1#02-32  291523  131595  -54.86%
BenchmarkEncodeAllDict8192_16384/length-12131-level-better-dict-1#02-32  467178  206408  -55.82%
BenchmarkEncodeAllDict8192_16384/length-12131-level-fastest-dict-1#03-32  54431  54289  -0.26%
BenchmarkEncodeAllDict8192_16384/length-12131-level-default-dict-1#03-32  291092  130441  -55.19%
BenchmarkEncodeAllDict8192_16384/length-12131-level-better-dict-1#03-32  476490  205606  -56.85%
BenchmarkEncodeAllDict16384_65536/length-59695-level-fastest-dict-1-32  245211  243965  -0.51%
BenchmarkEncodeAllDict16384_65536/length-59695-level-default-dict-1-32  817566  822310  +0.58%
BenchmarkEncodeAllDict16384_65536/length-59695-level-better-dict-1-32  1258889  590281  -53.11%
BenchmarkEncodeAllDict16384_65536/length-59695-level-fastest-dict-1#01-32  242203  241662  -0.22%
BenchmarkEncodeAllDict16384_65536/length-59695-level-default-dict-1#01-32  812895  818005  +0.63%
BenchmarkEncodeAllDict16384_65536/length-59695-level-better-dict-1#01-32  1265187  590826  -53.30%
BenchmarkEncodeAllDict16384_65536/length-59695-level-fastest-dict-1#02-32  242602  241849  -0.31%
BenchmarkEncodeAllDict16384_65536/length-59695-level-default-dict-1#02-32  828540  819250  -1.12%
BenchmarkEncodeAllDict16384_65536/length-59695-level-better-dict-1#02-32  1286233  586918  -54.37%
BenchmarkEncodeAllDict16384_65536/length-59695-level-fastest-dict-1#03-32  245593  244559  -0.42%
BenchmarkEncodeAllDict16384_65536/length-59695-level-default-dict-1#03-32  813931  819203  +0.65%
BenchmarkEncodeAllDict16384_65536/length-59695-level-better-dict-1#03-32  1272813  581714  -54.30%
BenchmarkEncodeAllDict16384_65536/length-20000-level-fastest-dict-1-32  18972  18733  -1.26%
BenchmarkEncodeAllDict16384_65536/length-20000-level-default-dict-1-32  75984  39850  -47.55%
BenchmarkEncodeAllDict16384_65536/length-20000-level-better-dict-1-32  213173  27825  -86.95%
BenchmarkEncodeAllDict65536_0/length-210569-level-fastest-dict-1-32  1070089  1055243  -1.39%
BenchmarkEncodeAllDict65536_0/length-210569-level-default-dict-1-32  1780011  1819554  +2.22%
BenchmarkEncodeAllDict65536_0/length-210569-level-better-dict-1-32  2785437  1631976  -41.41%
BenchmarkEncodeAllDict65536_0/length-102605-level-fastest-dict-1-32  500568  499781  -0.16%
BenchmarkEncodeAllDict65536_0/length-102605-level-default-dict-1-32  1036024  1076927  +3.95%
BenchmarkEncodeAllDict65536_0/length-102605-level-better-dict-1-32  1740181  859317  -50.62%
BenchmarkEncodeAllDict65536_0/length-68013-level-fastest-dict-1-32  410671  405122  -1.35%
BenchmarkEncodeAllDict65536_0/length-68013-level-default-dict-1-32  1025429  1025611  +0.02%
BenchmarkEncodeAllDict65536_0/length-68013-level-better-dict-1-32  1584230  739134  -53.34%
BenchmarkEncodeAllDict65536_0/length-210569-level-fastest-dict-1#01-32  1054258  1048012  -0.59%
BenchmarkEncodeAllDict65536_0/length-210569-level-default-dict-1#01-32  1756825  1810346  +3.05%
BenchmarkEncodeAllDict65536_0/length-210569-level-better-dict-1#01-32  2816869  1659755  -41.08%
BenchmarkEncodeAllDict65536_0/length-102605-level-fastest-dict-1#01-32  498201  500382  +0.44%
BenchmarkEncodeAllDict65536_0/length-102605-level-default-dict-1#01-32  1045296  1075033  +2.84%
BenchmarkEncodeAllDict65536_0/length-102605-level-better-dict-1#01-32  1772563  855280  -51.75%
BenchmarkEncodeAllDict65536_0/length-68013-level-fastest-dict-1#01-32  411487  404032  -1.81%
BenchmarkEncodeAllDict65536_0/length-68013-level-default-dict-1#01-32  1009682  1023147  +1.33%
BenchmarkEncodeAllDict65536_0/length-68013-level-better-dict-1#01-32  1588776  728182  -54.17%
BenchmarkEncodeAllDict65536_0/length-102605-level-fastest-dict-1#02-32  501487  498564  -0.58%
BenchmarkEncodeAllDict65536_0/length-102605-level-default-dict-1#02-32  1037744  1074253  +3.52%
BenchmarkEncodeAllDict65536_0/length-102605-level-better-dict-1#02-32  1753509  859959  -50.96%
BenchmarkEncodeAllDict65536_0/length-68013-level-fastest-dict-1#02-32  407233  403579  -0.90%
BenchmarkEncodeAllDict65536_0/length-68013-level-default-dict-1#02-32  1013906  1026835  +1.28%
BenchmarkEncodeAllDict65536_0/length-68013-level-better-dict-1#02-32  1591512  731027  -54.07%
BenchmarkEncodeAllDict65536_0/length-102605-level-fastest-dict-1#03-32  500983  495842  -1.03%
BenchmarkEncodeAllDict65536_0/length-102605-level-default-dict-1#03-32  1046435  1075070  +2.74%
BenchmarkEncodeAllDict65536_0/length-102605-level-better-dict-1#03-32  1760434  860257  -51.13%
BenchmarkEncodeAllDict65536_0/length-68013-level-fastest-dict-1#03-32  409099  405108  -0.98%
BenchmarkEncodeAllDict65536_0/length-68013-level-default-dict-1#03-32  1011372  1021036  +0.96%
BenchmarkEncodeAllDict65536_0/length-68013-level-better-dict-1#03-32  1572944  731780  -53.48%

benchmark  old MB/s  new MB/s  speedup
BenchmarkEncodeAllDict0_1024/length-19-level-fastest-dict-1-32  3.32  21.84  6.58x
BenchmarkEncodeAllDict0_1024/length-19-level-default-dict-1-32  0.32  8.98  28.06x
BenchmarkEncodeAllDict0_1024/length-19-level-better-dict-1-32  0.10  7.74  77.40x
BenchmarkEncodeAllDict0_1024/length-5-level-fastest-dict-1-32  0.89  8.34  9.37x
BenchmarkEncodeAllDict0_1024/length-5-level-default-dict-1-32  0.08  4.09  51.12x
BenchmarkEncodeAllDict0_1024/length-5-level-better-dict-1-32  0.03  2.55  85.00x
BenchmarkEncodeAllDict0_1024/length-659-level-fastest-dict-1-32  49.39  50.18  1.02x
BenchmarkEncodeAllDict0_1024/length-659-level-default-dict-1-32  9.07  19.39  2.14x
BenchmarkEncodeAllDict0_1024/length-659-level-better-dict-1-32  3.12  29.11  9.33x
BenchmarkEncodeAllDict0_1024/length-174-level-fastest-dict-1-32  14.27  16.86  1.18x
BenchmarkEncodeAllDict0_1024/length-174-level-default-dict-1-32  2.44  6.09  2.50x
BenchmarkEncodeAllDict0_1024/length-174-level-better-dict-1-32  0.82  9.71  11.84x
BenchmarkEncodeAllDict0_1024/length-5-level-fastest-dict-1#01-32  0.90  8.40  9.33x
BenchmarkEncodeAllDict0_1024/length-5-level-default-dict-1#01-32  0.09  4.10  45.56x
BenchmarkEncodeAllDict0_1024/length-5-level-better-dict-1#01-32  0.03  2.55  85.00x
BenchmarkEncodeAllDict0_1024/length-659-level-fastest-dict-1#01-32  49.70  50.18  1.01x
BenchmarkEncodeAllDict0_1024/length-659-level-default-dict-1#01-32  9.16  19.44  2.12x
BenchmarkEncodeAllDict0_1024/length-659-level-better-dict-1#01-32  3.18  29.25  9.20x
BenchmarkEncodeAllDict0_1024/length-174-level-fastest-dict-1#01-32  14.24  16.90  1.19x
BenchmarkEncodeAllDict0_1024/length-174-level-default-dict-1#01-32  2.50  6.10  2.44x
BenchmarkEncodeAllDict0_1024/length-174-level-better-dict-1#01-32  0.85  9.66  11.36x
BenchmarkEncodeAllDict0_1024/length-5-level-fastest-dict-1#02-32  0.90  8.35  9.28x
BenchmarkEncodeAllDict0_1024/length-5-level-default-dict-1#02-32  0.08  4.07  50.88x
BenchmarkEncodeAllDict0_1024/length-5-level-better-dict-1#02-32  0.03  2.56  85.33x
BenchmarkEncodeAllDict0_1024/length-659-level-fastest-dict-1#02-32  49.36  50.20  1.02x
BenchmarkEncodeAllDict0_1024/length-659-level-default-dict-1#02-32  9.08  19.45  2.14x
BenchmarkEncodeAllDict0_1024/length-659-level-better-dict-1#02-32  3.09  29.27  9.47x
BenchmarkEncodeAllDict0_1024/length-174-level-fastest-dict-1#02-32  14.26  16.90  1.19x
BenchmarkEncodeAllDict0_1024/length-174-level-default-dict-1#02-32  2.51  6.12  2.44x
BenchmarkEncodeAllDict0_1024/length-174-level-better-dict-1#02-32  0.84  9.71  11.56x
BenchmarkEncodeAllDict0_1024/length-5-level-fastest-dict-1#03-32  0.90  8.33  9.26x
BenchmarkEncodeAllDict0_1024/length-5-level-default-dict-1#03-32  0.08  4.11  51.38x
BenchmarkEncodeAllDict0_1024/length-5-level-better-dict-1#03-32  0.03  2.56  85.33x
BenchmarkEncodeAllDict0_1024/length-659-level-fastest-dict-1#03-32  49.48  50.26  1.02x
BenchmarkEncodeAllDict0_1024/length-659-level-default-dict-1#03-32  9.38  19.47  2.08x
BenchmarkEncodeAllDict0_1024/length-659-level-better-dict-1#03-32  3.15  29.36  9.32x
BenchmarkEncodeAllDict0_1024/length-174-level-fastest-dict-1#03-32  14.16  16.90  1.19x
BenchmarkEncodeAllDict0_1024/length-174-level-default-dict-1#03-32  2.42  6.09  2.52x
BenchmarkEncodeAllDict0_1024/length-174-level-better-dict-1#03-32  0.81  9.72  12.00x
BenchmarkEncodeAllDict1024_8192/length-1076-level-fastest-dict-1-32  65.18  66.33  1.02x
BenchmarkEncodeAllDict1024_8192/length-1076-level-default-dict-1-32  12.88  26.03  2.02x
BenchmarkEncodeAllDict1024_8192/length-1076-level-better-dict-1-32  4.88  27.32  5.60x
BenchmarkEncodeAllDict1024_8192/length-5872-level-fastest-dict-1-32  142.78  143.31  1.00x
BenchmarkEncodeAllDict1024_8192/length-5872-level-default-dict-1-32  35.98  76.14  2.12x
BenchmarkEncodeAllDict1024_8192/length-5872-level-better-dict-1-32  18.42  42.82  2.32x
BenchmarkEncodeAllDict1024_8192/length-1076-level-fastest-dict-1#01-32  64.88  66.04  1.02x
BenchmarkEncodeAllDict1024_8192/length-1076-level-default-dict-1#01-32  13.03  26.17  2.01x
BenchmarkEncodeAllDict1024_8192/length-1076-level-better-dict-1#01-32  4.91  27.46  5.59x
BenchmarkEncodeAllDict1024_8192/length-5872-level-fastest-dict-1#01-32  138.93  142.90  1.03x
BenchmarkEncodeAllDict1024_8192/length-5872-level-default-dict-1#01-32  35.73  76.35  2.14x
BenchmarkEncodeAllDict1024_8192/length-5872-level-better-dict-1#01-32  17.91  43.02  2.40x
BenchmarkEncodeAllDict1024_8192/length-1076-level-fastest-dict-1#02-32  65.00  66.51  1.02x
BenchmarkEncodeAllDict1024_8192/length-1076-level-default-dict-1#02-32  12.91  26.09  2.02x
BenchmarkEncodeAllDict1024_8192/length-1076-level-better-dict-1#02-32  4.75  27.32  5.75x
BenchmarkEncodeAllDict1024_8192/length-5872-level-fastest-dict-1#02-32  142.61  143.80  1.01x
BenchmarkEncodeAllDict1024_8192/length-5872-level-default-dict-1#02-32  36.56  75.95  2.08x
BenchmarkEncodeAllDict1024_8192/length-5872-level-better-dict-1#02-32  18.74  42.94  2.29x
BenchmarkEncodeAllDict1024_8192/length-1076-level-fastest-dict-1#03-32  65.56  66.12  1.01x
BenchmarkEncodeAllDict1024_8192/length-1076-level-default-dict-1#03-32  13.14  26.15  1.99x
BenchmarkEncodeAllDict1024_8192/length-1076-level-better-dict-1#03-32  4.83  27.46  5.69x
BenchmarkEncodeAllDict1024_8192/length-5872-level-fastest-dict-1#03-32  142.41  143.30  1.01x
BenchmarkEncodeAllDict1024_8192/length-5872-level-default-dict-1#03-32  36.91  76.03  2.06x
BenchmarkEncodeAllDict1024_8192/length-5872-level-better-dict-1#03-32  18.95  42.99  2.27x
BenchmarkEncodeAllDict8192_16384/length-12131-level-fastest-dict-1-32  220.08  220.34  1.00x
BenchmarkEncodeAllDict8192_16384/length-12131-level-default-dict-1-32  41.55  91.66  2.21x
BenchmarkEncodeAllDict8192_16384/length-12131-level-better-dict-1-32  25.93  58.66  2.26x
BenchmarkEncodeAllDict8192_16384/length-12131-level-fastest-dict-1#01-32  225.61  221.43  0.98x
BenchmarkEncodeAllDict8192_16384/length-12131-level-default-dict-1#01-32  41.68  93.15  2.23x
BenchmarkEncodeAllDict8192_16384/length-12131-level-better-dict-1#01-32  25.44  59.09  2.32x
BenchmarkEncodeAllDict8192_16384/length-9024-level-fastest-dict-1-32  283.73  282.97  1.00x
BenchmarkEncodeAllDict8192_16384/length-9024-level-default-dict-1-32  77.19  147.87  1.92x
BenchmarkEncodeAllDict8192_16384/length-9024-level-better-dict-1-32  34.70  94.86  2.73x
BenchmarkEncodeAllDict8192_16384/length-12131-level-fastest-dict-1#02-32  221.23  223.24  1.01x
BenchmarkEncodeAllDict8192_16384/length-12131-level-default-dict-1#02-32  41.61  92.18  2.22x
BenchmarkEncodeAllDict8192_16384/length-12131-level-better-dict-1#02-32  25.97  58.77  2.26x
BenchmarkEncodeAllDict8192_16384/length-12131-level-fastest-dict-1#03-32  222.87  223.45  1.00x
BenchmarkEncodeAllDict8192_16384/length-12131-level-default-dict-1#03-32  41.67  93.00  2.23x
BenchmarkEncodeAllDict8192_16384/length-12131-level-better-dict-1#03-32  25.46  59.00  2.32x
BenchmarkEncodeAllDict16384_65536/length-59695-level-fastest-dict-1-32  243.44  244.69  1.01x
BenchmarkEncodeAllDict16384_65536/length-59695-level-default-dict-1-32  73.02  72.59  0.99x
BenchmarkEncodeAllDict16384_65536/length-59695-level-better-dict-1-32  47.42  101.13  2.13x
BenchmarkEncodeAllDict16384_65536/length-59695-level-fastest-dict-1#01-32  246.47  247.02  1.00x
BenchmarkEncodeAllDict16384_65536/length-59695-level-default-dict-1#01-32  73.44  72.98  0.99x
BenchmarkEncodeAllDict16384_65536/length-59695-level-better-dict-1#01-32  47.18  101.04  2.14x
BenchmarkEncodeAllDict16384_65536/length-59695-level-fastest-dict-1#02-32  246.06  246.83  1.00x
BenchmarkEncodeAllDict16384_65536/length-59695-level-default-dict-1#02-32  72.05  72.87  1.01x
BenchmarkEncodeAllDict16384_65536/length-59695-level-better-dict-1#02-32  46.41  101.71  2.19x
BenchmarkEncodeAllDict16384_65536/length-59695-level-fastest-dict-1#03-32  243.06  244.09  1.00x
BenchmarkEncodeAllDict16384_65536/length-59695-level-default-dict-1#03-32  73.34  72.87  0.99x
BenchmarkEncodeAllDict16384_65536/length-59695-level-better-dict-1#03-32  46.90  102.62  2.19x
BenchmarkEncodeAllDict16384_65536/length-20000-level-fastest-dict-1-32  1054.19  1067.64  1.01x
BenchmarkEncodeAllDict16384_65536/length-20000-level-default-dict-1-32  263.21  501.88  1.91x
BenchmarkEncodeAllDict16384_65536/length-20000-level-better-dict-1-32  93.82  718.77  7.66x
BenchmarkEncodeAllDict65536_0/length-210569-level-fastest-dict-1-32  196.78  199.55  1.01x
BenchmarkEncodeAllDict65536_0/length-210569-level-default-dict-1-32  118.30  115.73  0.98x
BenchmarkEncodeAllDict65536_0/length-210569-level-better-dict-1-32  75.60  129.03  1.71x
BenchmarkEncodeAllDict65536_0/length-102605-level-fastest-dict-1-32  204.98  205.30  1.00x
BenchmarkEncodeAllDict65536_0/length-102605-level-default-dict-1-32  99.04  95.28  0.96x
BenchmarkEncodeAllDict65536_0/length-102605-level-better-dict-1-32  58.96  119.40  2.03x
BenchmarkEncodeAllDict65536_0/length-68013-level-fastest-dict-1-32  165.61  167.88  1.01x
BenchmarkEncodeAllDict65536_0/length-68013-level-default-dict-1-32  66.33  66.31  1.00x
BenchmarkEncodeAllDict65536_0/length-68013-level-better-dict-1-32  42.93  92.02  2.14x
BenchmarkEncodeAllDict65536_0/length-210569-level-fastest-dict-1#01-32  199.73  200.92  1.01x
BenchmarkEncodeAllDict65536_0/length-210569-level-default-dict-1#01-32  119.86  116.31  0.97x
BenchmarkEncodeAllDict65536_0/length-210569-level-better-dict-1#01-32  74.75  126.87  1.70x
BenchmarkEncodeAllDict65536_0/length-102605-level-fastest-dict-1#01-32  205.95  205.05  1.00x
BenchmarkEncodeAllDict65536_0/length-102605-level-default-dict-1#01-32  98.16  95.44  0.97x
BenchmarkEncodeAllDict65536_0/length-102605-level-better-dict-1#01-32  57.89  119.97  2.07x
BenchmarkEncodeAllDict65536_0/length-68013-level-fastest-dict-1#01-32  165.29  168.34  1.02x
BenchmarkEncodeAllDict65536_0/length-68013-level-default-dict-1#01-32  67.36  66.47  0.99x
BenchmarkEncodeAllDict65536_0/length-68013-level-better-dict-1#01-32  42.81  93.40  2.18x
BenchmarkEncodeAllDict65536_0/length-102605-level-fastest-dict-1#02-32  204.60  205.80  1.01x
BenchmarkEncodeAllDict65536_0/length-102605-level-default-dict-1#02-32  98.87  95.51  0.97x
BenchmarkEncodeAllDict65536_0/length-102605-level-better-dict-1#02-32  58.51  119.31  2.04x
BenchmarkEncodeAllDict65536_0/length-68013-level-fastest-dict-1#02-32  167.01  168.52  1.01x
BenchmarkEncodeAllDict65536_0/length-68013-level-default-dict-1#02-32  67.08  66.24  0.99x
BenchmarkEncodeAllDict65536_0/length-68013-level-better-dict-1#02-32  42.73  93.04  2.18x
BenchmarkEncodeAllDict65536_0/length-102605-level-fastest-dict-1#03-32  204.81  206.93  1.01x
BenchmarkEncodeAllDict65536_0/length-102605-level-default-dict-1#03-32  98.05  95.44  0.97x
BenchmarkEncodeAllDict65536_0/length-102605-level-better-dict-1#03-32  58.28  119.27  2.05x
BenchmarkEncodeAllDict65536_0/length-68013-level-fastest-dict-1#03-32  166.25  167.89  1.01x
BenchmarkEncodeAllDict65536_0/length-68013-level-default-dict-1#03-32  67.25  66.61  0.99x
BenchmarkEncodeAllDict65536_0/length-68013-level-better-dict-1#03-32  43.24  92.94  2.15x
```
---
 zstd/dict_test.go       |  78 ++++--
 zstd/enc_base.go        |   4 +
 zstd/enc_better.go      | 588 +++++++++++++++++++++++++++++++++++++++-
 zstd/enc_dfast.go       | 414 +++++++++++++++++++++++++++-
 zstd/enc_fast.go        | 371 ++++++++++++++++++++++++-
 zstd/encoder_options.go |  14 +-
 6 files changed, 1426 insertions(+), 43 deletions(-)

diff --git a/zstd/dict_test.go b/zstd/dict_test.go
index 433349c5d4..e5cf6ac983 100644
--- a/zstd/dict_test.go
+++ b/zstd/dict_test.go
@@ -218,7 +218,7 @@ func TestEncoder_SmallDict(t *testing.T) {
 	}
 }
 
-func BenchmarkEncodeAllDict(b *testing.B) {
+func benchmarkEncodeAllLimitedBySize(b *testing.B, lowerLimit int, upperLimit int) {
 	fn := "testdata/dict-tests-small.zip"
 	data, err := ioutil.ReadFile(fn)
 	t := testing.TB(b)
@@ -232,7 +232,6 @@
 	}
 	var dicts [][]byte
 	var encs []*Encoder
-	var noDictEncs []*Encoder
 	var encNames []string
 
 	for _, tt := range zr.File {
@@ -251,21 +250,16 @@
 			}
 			dicts = append(dicts, in)
 			for level := SpeedFastest; level < speedLast; level++ {
-				enc, err := NewWriter(nil, WithEncoderConcurrency(1), WithEncoderDict(in), WithEncoderLevel(level), WithWindowSize(1<<17))
+				enc, err := NewWriter(nil, WithEncoderDict(in), WithEncoderLevel(level))
 				if err != nil {
 					t.Fatal(err)
 				}
 				encs = append(encs, enc)
 				encNames = append(encNames, fmt.Sprint("level-", level.String(), "-dict-", len(dicts)))
-
-				enc, err = NewWriter(nil, WithEncoderConcurrency(1), WithEncoderLevel(level), WithWindowSize(1<<17))
-				if err != nil {
-					t.Fatal(err)
-				}
-				noDictEncs = append(noDictEncs, enc)
 			}
 		}()
 	}
+	const nPer = int(speedLast - SpeedFastest)
 	dec, err := NewReader(nil, WithDecoderConcurrency(1), WithDecoderDicts(dicts...))
 	if err != nil {
 		t.Fatal(err)
@@ -273,10 +267,8 @@
 	}
 	defer dec.Close()
 
-	for i, tt := range zr.File {
-		if i == 5 {
-			break
-		}
+	tested := make(map[int]struct{})
+	for j, tt := range zr.File {
 		if !strings.HasSuffix(tt.Name, ".zst") {
 			continue
 		}
@@ -293,26 +285,64 @@
 		if err != nil {
 			t.Fatal(err)
 		}
+
+		// Only test each size once
+		if _, ok := tested[len(decoded)]; ok {
+			continue
+		}
+		tested[len(decoded)] = struct{}{}
+
+		if len(decoded) < lowerLimit {
+			continue
+		}
+
+		if upperLimit > 0 && len(decoded) > upperLimit {
+			continue
+		}
+
 		for i := range encs {
-			// Only do 1 dict (3 encoders) for now.
-			if i == 3 {
+			// Only do 1 dict (4 encoders) for now.
+			if i == nPer-1 {
 				break
 			}
 			// Attempt to compress with all dicts
-			var dst []byte
-			enc := encs[i]
-			b.Run(fmt.Sprintf("length-%d-%s", len(decoded), encNames[i]), func(b *testing.B) {
-				b.SetBytes(int64(len(decoded)))
-				b.ResetTimer()
-				b.ReportAllocs()
-				for i := 0; i < b.N; i++ {
-					dst = enc.EncodeAll(decoded, dst[:0])
-				}
+			encIdx := (i + j*nPer) % len(encs)
+			enc := encs[encIdx]
+			b.Run(fmt.Sprintf("length-%d-%s", len(decoded), encNames[encIdx]), func(b *testing.B) {
+				b.RunParallel(func(pb *testing.PB) {
+					dst := make([]byte, 0, len(decoded)+10)
+					b.SetBytes(int64(len(decoded)))
+					b.ResetTimer()
+					b.ReportAllocs()
+					for pb.Next() {
+						dst = enc.EncodeAll(decoded, dst[:0])
+					}
+				})
 			})
 		}
 	}
 }
 
+func BenchmarkEncodeAllDict0_1024(b *testing.B) {
+	benchmarkEncodeAllLimitedBySize(b, 0, 1024)
+}
+
+func BenchmarkEncodeAllDict1024_8192(b *testing.B) {
+	benchmarkEncodeAllLimitedBySize(b, 1024, 8192)
+}
+
+func BenchmarkEncodeAllDict8192_16384(b *testing.B) {
+	benchmarkEncodeAllLimitedBySize(b, 8192, 16384)
+}
+
+func BenchmarkEncodeAllDict16384_65536(b *testing.B) {
+	benchmarkEncodeAllLimitedBySize(b, 16384, 65536)
+}
+
+func BenchmarkEncodeAllDict65536_0(b *testing.B) {
+	benchmarkEncodeAllLimitedBySize(b, 65536, 0)
+}
+
 func TestDecoder_MoreDicts(t *testing.T) {
 	// All files have CRC
 	// https://files.klauspost.com/compress/zstd-dict-tests.zip
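For context, a minimal sketch (not part of the patch) of the usage pattern the new size-bucketed benchmarks exercise, built on this package's public API; `dict` and `payload` are hypothetical inputs:

```go
package sketch

import "github.com/klauspost/compress/zstd"

// compressWithDict builds one encoder per (dictionary, level) pair and
// reuses it across EncodeAll calls, as the benchmarks above do.
func compressWithDict(dict, payload []byte) ([]byte, error) {
	enc, err := zstd.NewWriter(nil,
		zstd.WithEncoderDict(dict), // selects the dict-aware encoders
		zstd.WithEncoderLevel(zstd.SpeedDefault))
	if err != nil {
		return nil, err
	}
	defer enc.Close()
	// EncodeAll on a small payload is exactly the case this patch speeds up.
	return enc.EncodeAll(payload, nil), nil
}
```

Each size bucket can be run individually, e.g. `go test -bench BenchmarkEncodeAllDict0_1024 ./zstd`.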
diff --git a/zstd/enc_base.go b/zstd/enc_base.go
index e3f01a10d0..2d4d893eb9 100644
--- a/zstd/enc_base.go
+++ b/zstd/enc_base.go
@@ -7,6 +7,10 @@ import (
 	"github.com/klauspost/compress/zstd/internal/xxhash"
 )
 
+const (
+	dictShardBits = 6
+)
+
 type fastBase struct {
 	// cur is the offset at the start of hist
 	cur int32
diff --git a/zstd/enc_better.go b/zstd/enc_better.go
index 1133d88814..c2ce4a2bac 100644
--- a/zstd/enc_better.go
+++ b/zstd/enc_better.go
@@ -16,6 +16,12 @@ const (
 	// This greatly depends on the type of input.
 	betterShortTableBits = 13                        // Bits used in the short match table
 	betterShortTableSize = 1 << betterShortTableBits // Size of the table
+
+	betterLongTableShardCnt  = 1 << (betterLongTableBits - dictShardBits)    // Number of shards in the table
+	betterLongTableShardSize = betterLongTableSize / betterLongTableShardCnt // Size of an individual shard
+
+	betterShortTableShardCnt  = 1 << (betterShortTableBits - dictShardBits)     // Number of shards in the table
+	betterShortTableShardSize = betterShortTableSize / betterShortTableShardCnt // Size of an individual shard
 )
 
 type prevEntry struct {
@@ -31,10 +37,17 @@
 // and that it is longer (lazy matching).
 type betterFastEncoder struct {
 	fastBase
-	table         [betterShortTableSize]tableEntry
-	longTable     [betterLongTableSize]prevEntry
-	dictTable     []tableEntry
-	dictLongTable []prevEntry
+	table     [betterShortTableSize]tableEntry
+	longTable [betterLongTableSize]prevEntry
+}
+
+type betterFastEncoderDict struct {
+	betterFastEncoder
+	dictTable            []tableEntry
+	dictLongTable        []prevEntry
+	shortTableShardDirty [betterShortTableShardCnt]bool
+	longTableShardDirty  [betterLongTableShardCnt]bool
+	allDirty             bool
 }
 
 // Encode improves compression...
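To make the shard arithmetic concrete (values taken from the constants above; the long-table variant works the same way): because the shard count is `1 << (tableBits - dictShardBits)`, every shard ends up with exactly `1 << dictShardBits` = 64 entries, whatever the table size.

```go
package sketch

// Worked example of the shard constants derived from dictShardBits.
const (
	dictShardBits = 6

	betterShortTableBits      = 13
	betterShortTableSize      = 1 << betterShortTableBits                       // 8192 entries
	betterShortTableShardCnt  = 1 << (betterShortTableBits - dictShardBits)     // 128 shards
	betterShortTableShardSize = betterShortTableSize / betterShortTableShardCnt // 64 entries per shard
)
```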
@@ -520,8 +533,507 @@ func (e *betterFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
 	e.Encode(blk, src)
 }
 
+// Encode improves compression...
+func (e *betterFastEncoderDict) Encode(blk *blockEnc, src []byte) {
+	const (
+		// Input margin is the number of bytes we read (8)
+		// and the maximum we will read ahead (2)
+		inputMargin            = 8 + 2
+		minNonLiteralBlockSize = 16
+	)
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.longTable[:] {
+				e.longTable[i] = prevEntry{}
+			}
+			e.cur = e.maxMatchOff
+			e.allDirty = true
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.longTable[:] {
+			v := e.longTable[i].offset
+			v2 := e.longTable[i].prev
+			if v < minOff {
+				v = 0
+				v2 = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+				if v2 < minOff {
+					v2 = 0
+				} else {
+					v2 = v2 - e.cur + e.maxMatchOff
+				}
+			}
+			e.longTable[i] = prevEntry{
+				offset: v,
+				prev:   v2,
+			}
+		}
+		e.allDirty = true
+		e.cur = e.maxMatchOff
+		break
+	}
+
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 1.
+	const stepSize = 1
+
+	const kSearchStrength = 9
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debug {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		var t int32
+		// We allow the encoder to optionally turn off repeat offsets across blocks
+		canRepeat := len(blk.sequences) > 2
+		var matched int32
+
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+
+			nextHashS := hash5(cv, betterShortTableBits)
+			nextHashL := hash8(cv, betterLongTableBits)
+			candidateL := e.longTable[nextHashL]
+			candidateS := e.table[nextHashS]
+
+			const repOff = 1
+			repIndex := s - offset1 + repOff
+			off := s + e.cur
+			e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset}
+			e.markLongShardDirty(nextHashL)
+			e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
+			e.markShortShardDirty(nextHashS)
+
+			if canRepeat {
+				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
+					// Consider history as well.
+					var seq seq
+					length := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
+
+					seq.matchLen = uint32(length - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 0
+					seq.offset = 1
+					if debugSequences {
+						println("repeat sequence", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+
+					// Index match start+1 (long) -> s - 1
+					index0 := s + repOff
+					s += length + repOff
+
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, length)
+						}
+						break encodeLoop
+					}
+					// Index skipped...
+					for index0 < s-1 {
+						cv0 := load6432(src, index0)
+						cv1 := cv0 >> 8
+						h0 := hash8(cv0, betterLongTableBits)
+						off := index0 + e.cur
+						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+						e.markLongShardDirty(h0)
+						h1 := hash5(cv1, betterShortTableBits)
+						e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						e.markShortShardDirty(h1)
+						index0 += 2
+					}
+					cv = load6432(src, s)
+					continue
+				}
+				const repOff2 = 1
+
+				// We deviate from the reference encoder and also check offset 2.
+				// Still slower and not much better, so disabled.
+				// repIndex = s - offset2 + repOff2
+				if false && repIndex >= 0 && load6432(src, repIndex) == load6432(src, s+repOff) {
+					// Consider history as well.
+					var seq seq
+					length := 8 + e.matchlen(s+8+repOff2, repIndex+8, src)
+
+					seq.matchLen = uint32(length - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff2
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 2
+					seq.offset = 2
+					if debugSequences {
+						println("repeat sequence 2", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+
+					index0 := s + repOff2
+					s += length + repOff2
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, length)
+						}
+						break encodeLoop
+					}
+
+					// Index skipped...
+					for index0 < s-1 {
+						cv0 := load6432(src, index0)
+						cv1 := cv0 >> 8
+						h0 := hash8(cv0, betterLongTableBits)
+						off := index0 + e.cur
+						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+						e.markLongShardDirty(h0)
+						h1 := hash5(cv1, betterShortTableBits)
+						e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						e.markShortShardDirty(h1)
+						index0 += 2
+					}
+					cv = load6432(src, s)
+					// Swap offsets
+					offset1, offset2 = offset2, offset1
+					continue
+				}
+			}
+			// Find the offsets of our two matches.
+			coffsetL := candidateL.offset - e.cur
+			coffsetLP := candidateL.prev - e.cur
+
+			// Check if we have a long match.
+			if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+				// Found a long match, at least 8 bytes.
+				matched = e.matchlen(s+8, coffsetL+8, src) + 8
+				t = coffsetL
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+
+				if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
+					// Found a long match, at least 8 bytes.
+					prevMatch := e.matchlen(s+8, coffsetLP+8, src) + 8
+					if prevMatch > matched {
+						matched = prevMatch
+						t = coffsetLP
+					}
+					if debugAsserts && s <= t {
+						panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+					}
+					if debugAsserts && s-t > e.maxMatchOff {
+						panic("s - t >e.maxMatchOff")
+					}
+					if debugMatches {
+						println("long match")
+					}
+				}
+				break
+			}
+
+			// Check if we have a long match on prev.
+			if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
+				// Found a long match, at least 8 bytes.
+				matched = e.matchlen(s+8, coffsetLP+8, src) + 8
+				t = coffsetLP
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+				break
+			}
+
+			coffsetS := candidateS.offset - e.cur
+
+			// Check if we have a short match.
+			if s-coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
+				// found a regular match
+				matched = e.matchlen(s+4, coffsetS+4, src) + 4
+
+				// See if we can find a long match at s+1
+				const checkAt = 1
+				cv := load6432(src, s+checkAt)
+				nextHashL = hash8(cv, betterLongTableBits)
+				candidateL = e.longTable[nextHashL]
+				coffsetL = candidateL.offset - e.cur
+
+				// We can store it, since we have at least a 4 byte match.
+				e.longTable[nextHashL] = prevEntry{offset: s + checkAt + e.cur, prev: candidateL.offset}
+				e.markLongShardDirty(nextHashL)
+				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+					// Found a long match, at least 8 bytes.
+					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
+					if matchedNext > matched {
+						t = coffsetL
+						s += checkAt
+						matched = matchedNext
+						if debugMatches {
+							println("long match (after short)")
+						}
+						break
+					}
+				}
+
+				// Check prev long...
+				coffsetL = candidateL.prev - e.cur
+				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+					// Found a long match, at least 8 bytes.
+					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
+					if matchedNext > matched {
+						t = coffsetL
+						s += checkAt
+						matched = matchedNext
+						if debugMatches {
+							println("prev long match (after short)")
+						}
+						break
+					}
+				}
+				t = coffsetS
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				if debugMatches {
+					println("short match")
+				}
+				break
+			}
+
+			// No match found, move forward in input.
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+
+		// A 4-byte match has been found. Update recent offsets.
+		// We'll later see if more than 4 bytes.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the n-byte match as long as possible.
+		l := matched
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+
+		// Index match start+1 (long) -> s - 1
+		index0 := s - l + 1
+		for index0 < s-1 {
+			cv0 := load6432(src, index0)
+			cv1 := cv0 >> 8
+			h0 := hash8(cv0, betterLongTableBits)
+			off := index0 + e.cur
+			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+			e.markLongShardDirty(h0)
+			h1 := hash5(cv1, betterShortTableBits)
+			e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
+			e.markShortShardDirty(h1)
+			index0 += 2
+		}
+
+		cv = load6432(src, s)
+		if !canRepeat {
+			continue
+		}
+
+		// Check offset 2
+		for {
+			o2 := s - offset2
+			if load3232(src, o2) != uint32(cv) {
+				// Do regular search
+				break
+			}
+
+			// Store this, since we have it.
+			nextHashS := hash5(cv, betterShortTableBits)
+			nextHashL := hash8(cv, betterLongTableBits)
+
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			l := 4 + e.matchlen(s+4, o2+4, src)
+
+			e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset}
+			e.markLongShardDirty(nextHashL)
+			e.table[nextHashS] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.markShortShardDirty(nextHashS)
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				// Finished
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+}
+
 // ResetDict will reset and set a dictionary if not nil
 func (e *betterFastEncoder) Reset(d *dict, singleBlock bool) {
+	e.resetBase(d, singleBlock)
+	if d != nil {
+		panic("betterFastEncoder: Reset with dict")
+	}
+}
+
+// ResetDict will reset and set a dictionary if not nil
+func (e *betterFastEncoderDict) Reset(d *dict, singleBlock bool) {
 	e.resetBase(d, singleBlock)
 	if d == nil {
 		return
@@ -558,6 +1070,7 @@
 		}
 	}
 	e.lastDictID = d.id
+	e.allDirty = true
 }
 
 // Init or copy dict table
@@ -586,11 +1099,72 @@
 		}
 	}
 	e.lastDictID = d.id
+	e.allDirty = true
 }
+
 	// Reset table to initial state
-	copy(e.longTable[:], e.dictLongTable)
+	{
+		dirtyShardCnt := 0
+		if !e.allDirty {
+			for i := range e.shortTableShardDirty {
+				if e.shortTableShardDirty[i] {
+					dirtyShardCnt++
+				}
+			}
+		}
+		const shardCnt = betterShortTableShardCnt
+		const shardSize = betterShortTableShardSize
+		if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
+			copy(e.table[:], e.dictTable)
+			for i := range e.shortTableShardDirty {
+				e.shortTableShardDirty[i] = false
+			}
+		} else {
+			for i := range e.shortTableShardDirty {
+				if !e.shortTableShardDirty[i] {
+					continue
+				}
+				copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
+				e.shortTableShardDirty[i] = false
+			}
+		}
+	}
+	{
+		dirtyShardCnt := 0
+		if !e.allDirty {
+			for i := range e.longTableShardDirty {
+				if e.longTableShardDirty[i] {
+					dirtyShardCnt++
+				}
+			}
+		}
+		const shardCnt = betterLongTableShardCnt
+		const shardSize = betterLongTableShardSize
+		if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
+			copy(e.longTable[:], e.dictLongTable)
+			for i := range e.longTableShardDirty {
+				e.longTableShardDirty[i] = false
+			}
+		} else {
+			for i := range e.longTableShardDirty {
+				if !e.longTableShardDirty[i] {
+					continue
+				}
+
+				copy(e.longTable[i*shardSize:(i+1)*shardSize], e.dictLongTable[i*shardSize:(i+1)*shardSize])
+				e.longTableShardDirty[i] = false
+			}
+		}
+	}
 	e.cur = e.maxMatchOff
-	// Reset table to initial state
-	copy(e.table[:], e.dictTable)
+	e.allDirty = false
+}
+
+func (e *betterFastEncoderDict) markLongShardDirty(entryNum uint32) {
+	e.longTableShardDirty[entryNum/betterLongTableShardSize] = true
+}
+
+func (e *betterFastEncoderDict) markShortShardDirty(entryNum uint32) {
+	e.shortTableShardDirty[entryNum/betterShortTableShardSize] = true
+}
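The marking helpers above cost a single integer division (a shift, since the shard size is a power of two) per table write: dividing the entry index by the shard size yields the shard number. For example, with 64-entry shards, table entry 1000 dirties shard 15. An illustration only, not patch code:

```go
package sketch

// shardOf mirrors what markShortShardDirty/markLongShardDirty compute.
func shardOf(entryNum uint32) uint32 {
	const shardSize = 64 // e.g. betterShortTableShardSize
	return entryNum / shardSize
}

// shardOf(0) == 0, shardOf(63) == 0, shardOf(64) == 1, shardOf(1000) == 15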
diff --git a/zstd/enc_dfast.go b/zstd/enc_dfast.go
index 19eebf66e5..8629d43d86 100644
--- a/zstd/enc_dfast.go
+++ b/zstd/enc_dfast.go
@@ -11,6 +11,9 @@ const (
 	dFastLongTableSize = 1 << dFastLongTableBits // Size of the table
 	dFastLongTableMask = dFastLongTableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
 
+	dLongTableShardCnt  = 1 << (dFastLongTableBits - dictShardBits) // Number of shards in the table
+	dLongTableShardSize = dFastLongTableSize / dLongTableShardCnt   // Size of an individual shard
+
 	dFastShortTableBits = tableBits                // Bits used in the short match table
 	dFastShortTableSize = 1 << dFastShortTableBits // Size of the table
 	dFastShortTableMask = dFastShortTableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
@@ -18,8 +21,14 @@ const (
 )
 
 type doubleFastEncoder struct {
 	fastEncoder
-	longTable     [dFastLongTableSize]tableEntry
-	dictLongTable []tableEntry
+	longTable [dFastLongTableSize]tableEntry
+}
+
+type doubleFastEncoderDict struct {
+	fastEncoderDict
+	longTable           [dFastLongTableSize]tableEntry
+	dictLongTable       []tableEntry
+	longTableShardDirty [dLongTableShardCnt]bool
 }
 
 // Encode mimics functionality in zstd_dfast.c
@@ -678,9 +687,379 @@
 	}
 }
 
+// Encode will encode the content, with a dictionary if initialized for it.
+func (e *doubleFastEncoderDict) Encode(blk *blockEnc, src []byte) {
+	const (
+		// Input margin is the number of bytes we read (8)
+		// and the maximum we will read ahead (2)
+		inputMargin            = 8 + 2
+		minNonLiteralBlockSize = 16
+	)
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.longTable[:] {
+				e.longTable[i] = tableEntry{}
+			}
+			e.markAllShardsDirty()
+			e.cur = e.maxMatchOff
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.longTable[:] {
+			v := e.longTable[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.longTable[i].offset = v
+		}
+		e.markAllShardsDirty()
+		e.cur = e.maxMatchOff
+		break
+	}
+
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 1.
+	const stepSize = 1
+
+	const kSearchStrength = 8
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debug {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		var t int32
+		// We allow the encoder to optionally turn off repeat offsets across blocks
+		canRepeat := len(blk.sequences) > 2
+
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+
+			nextHashS := hash5(cv, dFastShortTableBits)
+			nextHashL := hash8(cv, dFastLongTableBits)
+			candidateL := e.longTable[nextHashL]
+			candidateS := e.table[nextHashS]
+
+			const repOff = 1
+			repIndex := s - offset1 + repOff
+			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.longTable[nextHashL] = entry
+			e.markLongShardDirty(nextHashL)
+			e.table[nextHashS] = entry
+			e.markShardDirty(nextHashS)
+
+			if canRepeat {
+				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
+					// Consider history as well.
+					var seq seq
+					length := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
+
+					seq.matchLen = uint32(length - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 0
+					seq.offset = 1
+					if debugSequences {
+						println("repeat sequence", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+					s += length + repOff
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, length)
+						}
+						break encodeLoop
+					}
+					cv = load6432(src, s)
+					continue
+				}
+			}
+			// Find the offsets of our two matches.
+			coffsetL := s - (candidateL.offset - e.cur)
+			coffsetS := s - (candidateS.offset - e.cur)
+
+			// Check if we have a long match.
+			if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
+				// Found a long match, likely at least 8 bytes.
+				// Reference encoder checks all 8 bytes, we only check 4,
+				// but the likelihood of both the first 4 bytes and the hash matching should be enough.
+				t = candidateL.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+				break
+			}
+
+			// Check if we have a short match.
+			if coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
+				// found a regular match
+				// See if we can find a long match at s+1
+				const checkAt = 1
+				cv := load6432(src, s+checkAt)
+				nextHashL = hash8(cv, dFastLongTableBits)
+				candidateL = e.longTable[nextHashL]
+				coffsetL = s - (candidateL.offset - e.cur) + checkAt
+
+				// We can store it, since we have at least a 4 byte match.
+				e.longTable[nextHashL] = tableEntry{offset: s + checkAt + e.cur, val: uint32(cv)}
+				e.markLongShardDirty(nextHashL)
+				if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
+					// Found a long match, likely at least 8 bytes.
+					// Reference encoder checks all 8 bytes, we only check 4,
+					// but the likelihood of both the first 4 bytes and the hash matching should be enough.
+					t = candidateL.offset - e.cur
+					s += checkAt
+					if debugMatches {
+						println("long match (after short)")
+					}
+					break
+				}
+
+				t = candidateS.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				if debugMatches {
+					println("short match")
+				}
+				break
+			}
+
+			// No match found, move forward in input.
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+
+		// A 4-byte match has been found. Update recent offsets.
+		// We'll later see if more than 4 bytes.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the 4-byte match as long as possible.
+		l := e.matchlen(s+4, t+4, src) + 4
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+
+		// Index match start+1 (long) and start+2 (short)
+		index0 := s - l + 1
+		// Index match end-2 (long) and end-1 (short)
+		index1 := s - 2
+
+		cv0 := load6432(src, index0)
+		cv1 := load6432(src, index1)
+		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
+		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
+		longHash1 := hash8(cv0, dFastLongTableBits)
+		longHash2 := hash8(cv1, dFastLongTableBits)
+		e.longTable[longHash1] = te0
+		e.longTable[longHash2] = te1
+		e.markLongShardDirty(longHash1)
+		e.markLongShardDirty(longHash2)
+		cv0 >>= 8
+		cv1 >>= 8
+		te0.offset++
+		te1.offset++
+		te0.val = uint32(cv0)
+		te1.val = uint32(cv1)
+		hashVal1 := hash5(cv0, dFastShortTableBits)
+		hashVal2 := hash5(cv1, dFastShortTableBits)
+		e.table[hashVal1] = te0
+		e.markShardDirty(hashVal1)
+		e.table[hashVal2] = te1
+		e.markShardDirty(hashVal2)
+
+		cv = load6432(src, s)
+
+		if !canRepeat {
+			continue
+		}
+
+		// Check offset 2
+		for {
+			o2 := s - offset2
+			if load3232(src, o2) != uint32(cv) {
+				// Do regular search
+				break
+			}
+
+			// Store this, since we have it.
+			nextHashS := hash5(cv, dFastShortTableBits)
+			nextHashL := hash8(cv, dFastLongTableBits)
+
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			l := 4 + e.matchlen(s+4, o2+4, src)
+
+			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.longTable[nextHashL] = entry
+			e.markLongShardDirty(nextHashL)
+			e.table[nextHashS] = entry
+			e.markShardDirty(nextHashS)
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				// Finished
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+	// If we encoded more than 64K mark all dirty.
+	if len(src) > 64<<10 {
+		e.markAllShardsDirty()
+	}
+}
+
 // ResetDict will reset and set a dictionary if not nil
 func (e *doubleFastEncoder) Reset(d *dict, singleBlock bool) {
 	e.fastEncoder.Reset(d, singleBlock)
+	if d != nil {
+		panic("doubleFastEncoder: Reset with dict not supported")
+	}
+}
+
+// ResetDict will reset and set a dictionary if not nil
+func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) {
+	allDirty := e.allDirty
+	e.fastEncoderDict.Reset(d, singleBlock)
 	if d == nil {
 		return
 	}
@@ -706,8 +1085,37 @@
 		}
 	}
 	e.lastDictID = d.id
+	e.allDirty = true
 }
 
 	// Reset table to initial state
 	e.cur = e.maxMatchOff
-	copy(e.longTable[:], e.dictLongTable)
+
+	dirtyShardCnt := 0
+	if !allDirty {
+		for i := range e.longTableShardDirty {
+			if e.longTableShardDirty[i] {
+				dirtyShardCnt++
+			}
+		}
+	}
+
+	if allDirty || dirtyShardCnt > dLongTableShardCnt/2 {
+		copy(e.longTable[:], e.dictLongTable)
+		for i := range e.longTableShardDirty {
+			e.longTableShardDirty[i] = false
+		}
+		return
+	}
+	for i := range e.longTableShardDirty {
+		if !e.longTableShardDirty[i] {
+			continue
+		}
+
+		copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize])
+		e.longTableShardDirty[i] = false
+	}
+}
+
+func (e *doubleFastEncoderDict) markLongShardDirty(entryNum uint32) {
+	e.longTableShardDirty[entryNum/dLongTableShardSize] = true
+}
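The Reset heuristics differ slightly per encoder (a full copy above 1/2 of shards dirty for the double-fast long table, above 4/6 for the better encoder); the trade-off they approximate is sketched below with illustrative numbers for the fast encoder's table (tableBits = 15: 512 shards of 64 entries), not measurements:

```go
package sketch

// entriesCopied contrasts shard-wise restore with a single full copy.
func entriesCopied(dirtyShards int) (partial, full int) {
	const shardCnt, shardSize = 512, 64
	return dirtyShards * shardSize, shardCnt * shardSize
}

// entriesCopied(100) == (6400, 32768): with few dirty shards, the partial
// copy moves ~5x less data; once most shards are dirty, one full copy is
// cheaper than per-shard branching and bookkeeping.
```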
diff --git a/zstd/enc_fast.go b/zstd/enc_fast.go
index 0045016d94..ba4a17e106 100644
--- a/zstd/enc_fast.go
+++ b/zstd/enc_fast.go
@@ -11,9 +11,11 @@ import (
 )
 
 const (
-	tableBits      = 15             // Bits used in the table
-	tableSize      = 1 << tableBits // Size of the table
-	tableMask      = tableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
+	tableBits      = 15                               // Bits used in the table
+	tableSize      = 1 << tableBits                   // Size of the table
+	tableShardCnt  = 1 << (tableBits - dictShardBits) // Number of shards in the table
+	tableShardSize = tableSize / tableShardCnt        // Size of an individual shard
+	tableMask      = tableSize - 1                    // Mask for table indices. Redundant, but can eliminate bounds checks.
 	maxMatchLength = 131074
 )
 
@@ -24,8 +26,14 @@ type tableEntry struct {
 }
 
 type fastEncoder struct {
 	fastBase
-	table     [tableSize]tableEntry
-	dictTable []tableEntry
+	table [tableSize]tableEntry
+}
+
+type fastEncoderDict struct {
+	fastEncoder
+	dictTable       []tableEntry
+	tableShardDirty [tableShardCnt]bool
+	allDirty        bool
 }
 
 // Encode mimics functionality in zstd_fast.c
@@ -617,8 +625,322 @@
 	}
 }
 
+// Encode will encode the content, with a dictionary if initialized for it.
+func (e *fastEncoderDict) Encode(blk *blockEnc, src []byte) {
+	const (
+		inputMargin            = 8
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+	if e.allDirty || len(src) > 32<<10 {
+		e.fastEncoder.Encode(blk, src)
+		e.allDirty = true
+		return
+	}
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			e.cur = e.maxMatchOff
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.table[i].offset = v
+		}
+		e.cur = e.maxMatchOff
+		break
+	}
+
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 2.
+	const stepSize = 2
+
+	// TEMPLATE
+	const hashLog = tableBits
+	// seems global, but would be nice to tweak.
+	const kSearchStrength = 7
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debug {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		// t will contain the match offset when we find one.
+		// When exiting the search loop, we have already checked 4 bytes.
+		var t int32
+
+		// We will not use repeat offsets across blocks.
+		// We achieve this by not using them for the first 3 matches.
+		canRepeat := len(blk.sequences) > 2
+
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+
+			nextHash := hash6(cv, hashLog)
+			nextHash2 := hash6(cv>>8, hashLog)
+			candidate := e.table[nextHash]
+			candidate2 := e.table[nextHash2]
+			repIndex := s - offset1 + 2
+
+			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.markShardDirty(nextHash)
+			e.table[nextHash2] = tableEntry{offset: s + e.cur + 1, val: uint32(cv >> 8)}
+			e.markShardDirty(nextHash2)
+
+			if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) {
+				// Consider history as well.
+				var seq seq
+				var length int32
+				// length = 4 + e.matchlen(s+6, repIndex+4, src)
+				{
+					a := src[s+6:]
+					b := src[repIndex+4:]
+					endI := len(a) & (math.MaxInt32 - 7)
+					length = int32(endI) + 4
+					for i := 0; i < endI; i += 8 {
+						if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+							length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+							break
+						}
+					}
+				}
+
+				seq.matchLen = uint32(length - zstdMinMatch)
+
+				// We might be able to match backwards.
+				// Extend as long as we can.
+				start := s + 2
+				// We end the search early, so we don't risk 0 literals
+				// and have to do special offset treatment.
+				startLimit := nextEmit + 1
+
+				sMin := s - e.maxMatchOff
+				if sMin < 0 {
+					sMin = 0
+				}
+				for repIndex > sMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch {
+					repIndex--
+					start--
+					seq.matchLen++
+				}
+				addLiterals(&seq, start)
+
+				// rep 0
+				seq.offset = 1
+				if debugSequences {
+					println("repeat sequence", seq, "next s:", s)
+				}
+				blk.sequences = append(blk.sequences, seq)
+				s += length + 2
+				nextEmit = s
+				if s >= sLimit {
+					if debug {
+						println("repeat ended", s, length)
+					}
+					break encodeLoop
+				}
+				cv = load6432(src, s)
+				continue
+			}
+			coffset0 := s - (candidate.offset - e.cur)
+			coffset1 := s - (candidate2.offset - e.cur) + 1
+			if coffset0 < e.maxMatchOff && uint32(cv) == candidate.val {
+				// found a regular match
+				t = candidate.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t > e.maxMatchOff")
+				}
+				break
+			}
+
+			if coffset1 < e.maxMatchOff && uint32(cv>>8) == candidate2.val {
+				// found a regular match
+				t = candidate2.offset - e.cur
+				s++
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t > e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				break
+			}
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+		// A 4-byte match has been found. We'll later see if more than 4 bytes match.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the 4-byte match as long as possible.
+		//l := e.matchlen(s+4, t+4, src) + 4
+		var l int32
+		{
+			a := src[s+4:]
+			b := src[t+4:]
+			endI := len(a) & (math.MaxInt32 - 7)
+			l = int32(endI) + 4
+			for i := 0; i < endI; i += 8 {
+				if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+					l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+					break
+				}
+			}
+		}
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence.
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		// Don't use repeat offsets
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+		cv = load6432(src, s)
+
+		// Check offset 2
+		if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) {
+			// We have at least a 4-byte match.
+			// No need to check backwards. We come straight from a match.
+			//l := 4 + e.matchlen(s+4, o2+4, src)
+			var l int32
+			{
+				a := src[s+4:]
+				b := src[o2+4:]
+				endI := len(a) & (math.MaxInt32 - 7)
+				l = int32(endI) + 4
+				for i := 0; i < endI; i += 8 {
+					if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+						l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+						break
+					}
+				}
+			}
+
+			// Store this, since we have it.
+			nextHash := hash6(cv, hashLog)
+			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.markShardDirty(nextHash)
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+			// Since litlen is always 0, this is offset 1.
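+			// (Matching offset2 would normally be repeat index 2, but with
+			// zero literals zstd shifts the repeat codes down by one.)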
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				break encodeLoop
+			}
+			// Prepare next loop.
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+}
+
 // Reset will reset and set a dictionary if not nil
 func (e *fastEncoder) Reset(d *dict, singleBlock bool) {
+	e.resetBase(d, singleBlock)
+	if d != nil {
+		panic("fastEncoder: Reset with dict")
+	}
+}
+
+// Reset will reset and set a dictionary if not nil
+func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) {
 	e.resetBase(d, singleBlock)
 	if d == nil {
 		return
@@ -653,9 +975,44 @@ func (e *fastEncoder) Reset(d *dict, singleBlock bool) {
 		}
 	}
 	e.lastDictID = d.id
+		e.allDirty = true
 	}
 	e.cur = e.maxMatchOff
-	// Reset table to initial state
-	copy(e.table[:], e.dictTable)
+	dirtyShardCnt := 0
+	if !e.allDirty {
+		for i := range e.tableShardDirty {
+			if e.tableShardDirty[i] {
+				dirtyShardCnt++
+			}
+		}
+	}
+
+	const shardCnt = tableShardCnt
+	const shardSize = tableShardSize
+	if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
+		copy(e.table[:], e.dictTable)
+		for i := range e.tableShardDirty {
+			e.tableShardDirty[i] = false
+		}
+		e.allDirty = false
+		return
+	}
+	for i := range e.tableShardDirty {
+		if !e.tableShardDirty[i] {
+			continue
+		}
+
+		copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
+		e.tableShardDirty[i] = false
+	}
+	e.allDirty = false
+}
+
+func (e *fastEncoderDict) markAllShardsDirty() {
+	e.allDirty = true
+}
+
+func (e *fastEncoderDict) markShardDirty(entryNum uint32) {
+	e.tableShardDirty[entryNum/tableShardSize] = true
 }
diff --git a/zstd/encoder_options.go b/zstd/encoder_options.go
index abd953697e..18a47eb03e 100644
--- a/zstd/encoder_options.go
+++ b/zstd/encoder_options.go
@@ -44,14 +44,24 @@ func (o *encoderOptions) setDefault() {
 // encoder returns an encoder with the selected options.
 func (o encoderOptions) encoder() encoder {
 	switch o.level {
+	case SpeedFastest:
+		if o.dict != nil {
+			return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
+		}
+		return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
+
 	case SpeedDefault:
+		if o.dict != nil {
+			return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}}
+		}
 		return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
 	case SpeedBetterCompression:
+		if o.dict != nil {
+			return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
+		}
 		return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
 	case SpeedBestCompression:
 		return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
-	case SpeedFastest:
-		return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
 	}
 	panic("unknown compression level")
 }
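For readers skimming the diff, the shard-dirty mechanism shared by all the encoder variants reduces to a small standalone sketch. The names below (shardedTable, set, reset) are illustrative only, not the library's API; dictShardBits is defined elsewhere in the patch, and a value of 6 is assumed here:

```go
package main

import "fmt"

// Shard geometry mirroring enc_fast.go (shardBits stands in for
// dictShardBits; 6 is an assumption, not taken from this patch).
const (
	tableBits = 15
	tableSize = 1 << tableBits
	shardBits = 6
	shardCnt  = 1 << (tableBits - shardBits)
	shardSize = tableSize / shardCnt
)

// shardedTable is an illustrative stand-in for the encoder's hash table.
type shardedTable struct {
	table      [tableSize]uint32
	dictTable  [tableSize]uint32 // pristine state built from the dictionary
	shardDirty [shardCnt]bool
	allDirty   bool
}

// set writes an entry and records its shard as dirty, mirroring
// e.table[h] = ...; e.markShardDirty(h) in the patch.
func (t *shardedTable) set(h, v uint32) {
	t.table[h] = v
	t.shardDirty[h/shardSize] = true
}

// reset restores the dictionary state. When most shards are dirty, one
// sequential full copy wins; otherwise only dirty shards are copied back.
func (t *shardedTable) reset() {
	dirty := 0
	if !t.allDirty {
		for _, d := range t.shardDirty {
			if d {
				dirty++
			}
		}
	}
	if t.allDirty || dirty > shardCnt*4/6 {
		copy(t.table[:], t.dictTable[:])
	} else {
		for i, d := range t.shardDirty {
			if d {
				copy(t.table[i*shardSize:(i+1)*shardSize],
					t.dictTable[i*shardSize:(i+1)*shardSize])
			}
		}
	}
	for i := range t.shardDirty {
		t.shardDirty[i] = false
	}
	t.allDirty = false
}

func main() {
	var t shardedTable
	t.set(100, 42)            // touches a single shard
	t.reset()                 // copies back one 64-entry shard, not all 32768 entries
	fmt.Println(t.table[100]) // 0 again: restored from dictTable
}
```

The 4/6 cut-over mirrors the heuristic in fastEncoderDict.Reset above: once roughly two thirds of the shards have been written, one sequential copy of the whole table is cheaper than many small per-shard copies. The double-fast long table uses a 1/2 threshold instead, and a short input (the < 32K check in Encode) is what keeps the dirty-shard count low enough for this to pay off.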