Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

zstd: Skip entropy on random data #270

Merged
merged 1 commit into from
Jun 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions zstd/blockenc.go
Original file line number Diff line number Diff line change
Expand Up @@ -444,9 +444,9 @@ func fuzzFseEncoder(data []byte) int {
}

// encode will encode the block and append the output in b.output.
func (b *blockEnc) encode(raw bool) error {
func (b *blockEnc) encode(raw, rawAllLits bool) error {
if len(b.sequences) == 0 {
return b.encodeLits(raw)
return b.encodeLits(rawAllLits)
}
// We want some difference
if len(b.literals) > (b.size - (b.size >> 5)) {
Expand Down
6 changes: 3 additions & 3 deletions zstd/encoder.go
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ func (e *Encoder) nextBlock(final bool) error {
// If we got the exact same number of literals as input,
// assume the literals cannot be compressed.
if len(src) != len(blk.literals) || len(src) != e.o.blockSize {
err = blk.encode(e.o.noEntropy)
err = blk.encode(e.o.noEntropy, !e.o.allLitEntropy)
}
switch err {
case errIncompressible:
Expand Down Expand Up @@ -491,7 +491,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
if len(blk.literals) != len(src) || len(src) != e.o.blockSize {
// Output directly to dst
blk.output = dst
err = blk.encode(e.o.noEntropy)
err = blk.encode(e.o.noEntropy, !e.o.allLitEntropy)
}

switch err {
Expand Down Expand Up @@ -528,7 +528,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
// If we got the exact same number of literals as input,
// assume the literals cannot be compressed.
if len(blk.literals) != len(todo) || len(todo) != e.o.blockSize {
err = blk.encode(e.o.noEntropy)
err = blk.encode(e.o.noEntropy, !e.o.allLitEntropy)
}

switch err {
Expand Down
38 changes: 28 additions & 10 deletions zstd/encoder_options.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,18 @@ type EOption func(*encoderOptions) error

// options retains accumulated state of multiple options.
type encoderOptions struct {
concurrent int
level EncoderLevel
single *bool
pad int
blockSize int
windowSize int
crc bool
fullZero bool
noEntropy bool
customWindow bool
concurrent int
level EncoderLevel
single *bool
pad int
blockSize int
windowSize int
crc bool
fullZero bool
noEntropy bool
allLitEntropy bool
customWindow bool
customALEntropy bool
}

func (o *encoderOptions) setDefault() {
Expand Down Expand Up @@ -207,6 +209,10 @@ func WithEncoderLevel(l EncoderLevel) EOption {
o.windowSize = 16 << 20
}
}
if !o.customALEntropy {
o.allLitEntropy = l > SpeedFastest
}

return nil
}
}
Expand All @@ -221,6 +227,18 @@ func WithZeroFrames(b bool) EOption {
}
}

// WithAllLitEntropyCompression controls whether entropy compression is applied
// to the literals even when no matches were found in a block.
// Turning it off lets incompressible input be skipped faster, at the cost of a
// worse ratio on data that has no matches but a skewed byte distribution.
// The default depends on the compression level selected.
func WithAllLitEntropyCompression(b bool) EOption {
	return func(opts *encoderOptions) error {
		opts.allLitEntropy = b
		// Record that the caller chose this explicitly, so a later
		// WithEncoderLevel does not overwrite the setting.
		opts.customALEntropy = true
		return nil
	}
}

// WithNoEntropyCompression will always skip entropy compression of literals.
// This can be useful if content has matches, but unlikely to benefit from entropy
// compression. Usually the slight speed improvement is not worth enabling this.
Expand Down
54 changes: 43 additions & 11 deletions zstd/encoder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -927,9 +927,9 @@ func BenchmarkEncoder_EncodeAllPi(b *testing.B) {
}
}

func BenchmarkRandomEncodeAllFastest(b *testing.B) {
func BenchmarkRandom4KEncodeAllFastest(b *testing.B) {
rng := rand.New(rand.NewSource(1))
data := make([]byte, 10<<20)
data := make([]byte, 4<<10)
for i := range data {
data[i] = uint8(rng.Intn(256))
}
Expand All @@ -948,12 +948,29 @@ func BenchmarkRandomEncodeAllFastest(b *testing.B) {
}
}

func BenchmarkRandomEncodeAllDefault(b *testing.B) {
func BenchmarkRandom10MBEncodeAllFastest(b *testing.B) {
rng := rand.New(rand.NewSource(1))
data := make([]byte, 10<<20)
for i := range data {
data[i] = uint8(rng.Intn(256))
rng.Read(data)
enc, _ := NewWriter(nil, WithEncoderLevel(SpeedFastest), WithEncoderConcurrency(1))
defer enc.Close()
dst := enc.EncodeAll(data, nil)
wantSize := len(dst)
b.ResetTimer()
b.ReportAllocs()
b.SetBytes(int64(len(data)))
for i := 0; i < b.N; i++ {
dst := enc.EncodeAll(data, dst[:0])
if len(dst) != wantSize {
b.Fatal(len(dst), "!=", wantSize)
}
}
}

func BenchmarkRandom4KEncodeAllDefault(b *testing.B) {
rng := rand.New(rand.NewSource(1))
data := make([]byte, 4<<10)
rng.Read(data)
enc, _ := NewWriter(nil, WithEncoderLevel(SpeedDefault), WithEncoderConcurrency(1))
defer enc.Close()
dst := enc.EncodeAll(data, nil)
Expand All @@ -969,12 +986,29 @@ func BenchmarkRandomEncodeAllDefault(b *testing.B) {
}
}

func BenchmarkRandomEncoderFastest(b *testing.B) {
func BenchmarkRandomEncodeAllDefault(b *testing.B) {
rng := rand.New(rand.NewSource(1))
data := make([]byte, 10<<20)
for i := range data {
data[i] = uint8(rng.Intn(256))
rng.Read(data)
enc, _ := NewWriter(nil, WithEncoderLevel(SpeedDefault), WithEncoderConcurrency(1))
defer enc.Close()
dst := enc.EncodeAll(data, nil)
wantSize := len(dst)
b.ResetTimer()
b.ReportAllocs()
b.SetBytes(int64(len(data)))
for i := 0; i < b.N; i++ {
dst := enc.EncodeAll(data, dst[:0])
if len(dst) != wantSize {
b.Fatal(len(dst), "!=", wantSize)
}
}
}

func BenchmarkRandom10MBEncoderFastest(b *testing.B) {
rng := rand.New(rand.NewSource(1))
data := make([]byte, 10<<20)
rng.Read(data)
wantSize := int64(len(data))
enc, _ := NewWriter(ioutil.Discard, WithEncoderLevel(SpeedFastest))
defer enc.Close()
Expand Down Expand Up @@ -1003,9 +1037,7 @@ func BenchmarkRandomEncoderFastest(b *testing.B) {
func BenchmarkRandomEncoderDefault(b *testing.B) {
rng := rand.New(rand.NewSource(1))
data := make([]byte, 10<<20)
for i := range data {
data[i] = uint8(rng.Intn(256))
}
rng.Read(data)
wantSize := int64(len(data))
enc, _ := NewWriter(ioutil.Discard, WithEncoderLevel(SpeedDefault))
defer enc.Close()
Expand Down
2 changes: 1 addition & 1 deletion zstd/snappy.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) {
r.err = ErrSnappyCorrupt
return written, r.err
}
err = r.block.encode(false)
err = r.block.encode(false, false)
switch err {
case errIncompressible:
r.block.popOffsets()
Expand Down