Skip to content

Commit

Permalink
deflate: Better Huffman encoding (#374)
Browse files Browse the repository at this point in the history
* deflate: Better Huffman encoding

Speed up and improve Huffman compression:

```
λ benchcmp before.txt after.txt
benchmark                               old ns/op     new ns/op     delta
BenchmarkEncodeDigitsConstant1e4-32     32925         20138         -38.84%
BenchmarkEncodeDigitsConstant1e5-32     425414        218386        -48.67%
BenchmarkEncodeDigitsConstant1e6-32     4261446       1866023       -56.21%
BenchmarkEncodeDigitsSpeed1e4-32        66777         60683         -9.13%
BenchmarkEncodeDigitsSpeed1e5-32        855737        807328        -5.66%
BenchmarkEncodeDigitsSpeed1e6-32        8584307       7505546       -12.57%
BenchmarkEncodeDigitsDefault1e4-32      124753        123101        -1.32%
BenchmarkEncodeDigitsDefault1e5-32      1536784       1507136       -1.93%
BenchmarkEncodeDigitsDefault1e6-32      15765790      14838850      -5.88%
BenchmarkEncodeDigitsCompress1e4-32     185589        186598        +0.54%
BenchmarkEncodeDigitsCompress1e5-32     3264706       3277041       +0.38%
BenchmarkEncodeDigitsCompress1e6-32     35219900      35308128      +0.25%
BenchmarkEncodeDigitsSL1e4-32           59526         54858         -7.84%
BenchmarkEncodeDigitsSL1e5-32           916883        896292        -2.25%
BenchmarkEncodeDigitsSL1e6-32           9180701       8873708       -3.34%
BenchmarkEncodeTwainConstant1e4-32      41059         29454         -28.26%
BenchmarkEncodeTwainConstant1e5-32      486514        248799        -48.86%
BenchmarkEncodeTwainConstant1e6-32      3938046       2547548       -35.31%
BenchmarkEncodeTwainSpeed1e4-32         87027         82783         -4.88%
BenchmarkEncodeTwainSpeed1e5-32         851805        803264        -5.70%
BenchmarkEncodeTwainSpeed1e6-32         7885728       7452326       -5.50%
BenchmarkEncodeTwainDefault1e4-32       126807        126695        -0.09%
BenchmarkEncodeTwainDefault1e5-32       1371597       1373745       +0.16%
BenchmarkEncodeTwainDefault1e6-32       13067533      13027351      -0.31%
BenchmarkEncodeTwainCompress1e4-32      237083        234776        -0.97%
BenchmarkEncodeTwainCompress1e5-32      4430928       4396044       -0.79%
BenchmarkEncodeTwainCompress1e6-32      48377762      48015133      -0.75%
BenchmarkEncodeTwainSL1e4-32            80816         81162         +0.43%
BenchmarkEncodeTwainSL1e5-32            889941        868247        -2.44%
BenchmarkEncodeTwainSL1e6-32            8740752       8356943       -4.39%
```

* Use a single table lookup to get the offset extra bits and base.

* Store the offsetCode inside the token.
  • Loading branch information
klauspost authored May 19, 2021
1 parent 2748482 commit 6274b7e
Show file tree
Hide file tree
Showing 11 changed files with 230 additions and 101 deletions.
2 changes: 1 addition & 1 deletion flate/deflate.go
Original file line number Diff line number Diff line change
Expand Up @@ -644,7 +644,7 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
d.fill = (*compressor).fillBlock
d.step = (*compressor).store
case level == ConstantCompression:
d.w.logNewTablePenalty = 8
d.w.logNewTablePenalty = 10
d.window = make([]byte, 32<<10)
d.fill = (*compressor).fillBlock
d.step = (*compressor).storeHuff
Expand Down
246 changes: 185 additions & 61 deletions flate/huffman_bit_writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package flate

import (
"encoding/binary"
"fmt"
"io"
)

Expand All @@ -27,7 +28,7 @@ const (
// after which bytes are flushed to the writer.
// Should preferably be a multiple of 6, since
// we accumulate 6 bytes between writes to the buffer.
bufferFlushSize = 240
bufferFlushSize = 246

// bufferSize is the actual output byte buffer size.
// It must have additional headroom for a flush
Expand Down Expand Up @@ -59,19 +60,31 @@ var offsetExtraBits = [64]int8{
14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20,
}

var offsetBase = [64]uint32{
/* normal deflate */
0x000000, 0x000001, 0x000002, 0x000003, 0x000004,
0x000006, 0x000008, 0x00000c, 0x000010, 0x000018,
0x000020, 0x000030, 0x000040, 0x000060, 0x000080,
0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300,
0x000400, 0x000600, 0x000800, 0x000c00, 0x001000,
0x001800, 0x002000, 0x003000, 0x004000, 0x006000,
var offsetCombined = [32]uint32{}

/* extended window */
0x008000, 0x00c000, 0x010000, 0x018000, 0x020000,
0x030000, 0x040000, 0x060000, 0x080000, 0x0c0000,
0x100000, 0x180000, 0x200000, 0x300000,
// init fills offsetCombined so that a single lookup per offset code yields
// both the extra-bit count and the base offset: the count from
// offsetExtraBits is stored shifted left by 16 bits and the base offset
// occupies the low bits. Codes whose base is above 0x006000 (the
// extended-window entries) are not written and keep their zero value.
func init() {
	// Base offset for every offset code.
	bases := [64]uint32{
		/* normal deflate */
		0x000000, 0x000001, 0x000002, 0x000003, 0x000004,
		0x000006, 0x000008, 0x00000c, 0x000010, 0x000018,
		0x000020, 0x000030, 0x000040, 0x000060, 0x000080,
		0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300,
		0x000400, 0x000600, 0x000800, 0x000c00, 0x001000,
		0x001800, 0x002000, 0x003000, 0x004000, 0x006000,

		/* extended window */
		0x008000, 0x00c000, 0x010000, 0x018000, 0x020000,
		0x030000, 0x040000, 0x060000, 0x080000, 0x0c0000,
		0x100000, 0x180000, 0x200000, 0x300000,
	}

	// Largest base that is still a "normal deflate" entry in the table above.
	const lastNormalBase = 0x006000

	for code := 0; code < len(offsetCombined); code++ {
		base := bases[code]
		if base <= lastNormalBase {
			extraBits := uint32(offsetExtraBits[code])
			offsetCombined[code] = extraBits<<16 | base
		}
	}
}

// The odd order in which the codegen code sizes are written.
Expand All @@ -88,15 +101,16 @@ type huffmanBitWriter struct {
bits uint64
nbits uint16
nbytes uint8
lastHuffMan bool
literalEncoding *huffmanEncoder
tmpLitEncoding *huffmanEncoder
offsetEncoding *huffmanEncoder
codegenEncoding *huffmanEncoder
err error
lastHeader int
// Set between 0 (reused block can be up to 2x the size)
logNewTablePenalty uint
lastHuffMan bool
bytes [256]byte
bytes [256 + 8]byte
literalFreq [lengthCodesStart + 32]uint16
offsetFreq [32]uint16
codegenFreq [codegenCodeCount]uint16
Expand Down Expand Up @@ -128,6 +142,7 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
return &huffmanBitWriter{
writer: w,
literalEncoding: newHuffmanEncoder(literalCount),
tmpLitEncoding: newHuffmanEncoder(literalCount),
codegenEncoding: newHuffmanEncoder(codegenCodeCount),
offsetEncoding: newHuffmanEncoder(offsetCodeCount),
}
Expand Down Expand Up @@ -745,9 +760,31 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
offs := oeCodes[:32]
lengths := leCodes[lengthCodesStart:]
lengths = lengths[:32]

// Go 1.16 LOVES having these on stack.
bits, nbits, nbytes := w.bits, w.nbits, w.nbytes

for _, t := range tokens {
if t < matchType {
w.writeCode(lits[t.literal()])
//w.writeCode(lits[t.literal()])
c := lits[t.literal()]
bits |= uint64(c.code) << nbits
nbits += c.len
if nbits >= 48 {
binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
bits >>= 48
nbits -= 48
nbytes += 6
if nbytes >= bufferFlushSize {
if w.err != nil {
nbytes = 0
return
}
_, w.err = w.writer.Write(w.bytes[:nbytes])
nbytes = 0
}
}
continue
}

Expand All @@ -759,38 +796,99 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
} else {
// inlined
c := lengths[lengthCode&31]
w.bits |= uint64(c.code) << w.nbits
w.nbits += c.len
if w.nbits >= 48 {
w.writeOutBits()
bits |= uint64(c.code) << nbits
nbits += c.len
if nbits >= 48 {
binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
bits >>= 48
nbits -= 48
nbytes += 6
if nbytes >= bufferFlushSize {
if w.err != nil {
nbytes = 0
return
}
_, w.err = w.writer.Write(w.bytes[:nbytes])
nbytes = 0
}
}
}

extraLengthBits := uint16(lengthExtraBits[lengthCode&31])
if extraLengthBits > 0 {
//w.writeBits(extraLength, extraLengthBits)
extraLength := int32(length - lengthBase[lengthCode&31])
w.writeBits(extraLength, extraLengthBits)
bits |= uint64(extraLength) << nbits
nbits += extraLengthBits
if nbits >= 48 {
binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
bits >>= 48
nbits -= 48
nbytes += 6
if nbytes >= bufferFlushSize {
if w.err != nil {
nbytes = 0
return
}
_, w.err = w.writer.Write(w.bytes[:nbytes])
nbytes = 0
}
}
}
// Write the offset
offset := t.offset()
offsetCode := offsetCode(offset)
offsetCode := offset >> 16
offset &= matchOffsetOnlyMask
if false {
w.writeCode(offs[offsetCode&31])
} else {
// inlined
c := offs[offsetCode&31]
w.bits |= uint64(c.code) << w.nbits
w.nbits += c.len
if w.nbits >= 48 {
w.writeOutBits()
c := offs[offsetCode]
bits |= uint64(c.code) << nbits
nbits += c.len
if nbits >= 48 {
binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
bits >>= 48
nbits -= 48
nbytes += 6
if nbytes >= bufferFlushSize {
if w.err != nil {
nbytes = 0
return
}
_, w.err = w.writer.Write(w.bytes[:nbytes])
nbytes = 0
}
}
}
extraOffsetBits := uint16(offsetExtraBits[offsetCode&63])
if extraOffsetBits > 0 {
extraOffset := int32(offset - offsetBase[offsetCode&63])
w.writeBits(extraOffset, extraOffsetBits)
offsetComb := offsetCombined[offsetCode]
if offsetComb > 1<<16 {
//w.writeBits(extraOffset, extraOffsetBits)
bits |= uint64(offset&matchOffsetOnlyMask-(offsetComb&0xffff)) << nbits
nbits += uint16(offsetComb >> 16)
if nbits >= 48 {
binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
bits >>= 48
nbits -= 48
nbytes += 6
if nbytes >= bufferFlushSize {
if w.err != nil {
nbytes = 0
return
}
_, w.err = w.writer.Write(w.bytes[:nbytes])
nbytes = 0
}
}
}
}
// Restore...
w.bits, w.nbits, w.nbytes = bits, nbits, nbytes

if deferEOB {
w.writeCode(leCodes[endBlockMarker])
}
Expand Down Expand Up @@ -825,47 +923,60 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
}
}

// Fill is rarely better...
const fill = false
const numLiterals = endBlockMarker + 1
const numOffsets = 1

// Add everything as literals
// We have to estimate the header size.
// Assume header is around 70 bytes:
// https://stackoverflow.com/a/25454430
const guessHeaderSizeBits = 70 * 8
estBits := histogramSize(input, w.literalFreq[:], !eof && !sync)
estBits += w.lastHeader + len(input)/32
histogram(input, w.literalFreq[:numLiterals], fill)
w.literalFreq[endBlockMarker] = 1
w.tmpLitEncoding.generate(w.literalFreq[:numLiterals], 15)
if fill {
// Clear fill...
for i := range w.literalFreq[:numLiterals] {
w.literalFreq[i] = 0
}
histogram(input, w.literalFreq[:numLiterals], false)
}
estBits := w.tmpLitEncoding.canReuseBits(w.literalFreq[:numLiterals])
estBits += w.lastHeader
if w.lastHeader == 0 {
estBits += guessHeaderSizeBits
}
estBits += estBits >> w.logNewTablePenalty

// Store bytes, if we don't get a reasonable improvement.
ssize, storable := w.storedSize(input)
if storable && ssize < estBits {
if storable && ssize <= estBits {
w.writeStoredHeader(len(input), eof)
w.writeBytes(input)
return
}

reuseSize := 0
if w.lastHeader > 0 {
reuseSize = w.literalEncoding.bitLength(w.literalFreq[:256])
reuseSize := w.literalEncoding.canReuseBits(w.literalFreq[:256])

if estBits < reuseSize {
if debugDeflate {
//fmt.Println("not reusing, reuse:", reuseSize/8, "> new:", estBits/8, "- header est:", w.lastHeader/8)
}
// We owe an EOB
w.writeCode(w.literalEncoding.codes[endBlockMarker])
w.lastHeader = 0
} else if debugDeflate {
fmt.Println("reusing, reuse:", reuseSize/8, "> new:", estBits/8, "- header est:", w.lastHeader/8)
}
}

const numLiterals = endBlockMarker + 1
const numOffsets = 1
count := 0
if w.lastHeader == 0 {
if !eof && !sync {
// Generate a slightly suboptimal tree that can be used for all.
fillHist(w.literalFreq[:numLiterals])
}
w.literalFreq[endBlockMarker] = 1
w.literalEncoding.generate(w.literalFreq[:numLiterals], 15)

// Use the temp encoding, so swap.
w.literalEncoding, w.tmpLitEncoding = w.tmpLitEncoding, w.literalEncoding
// Generate codegen and codegenFrequencies, which indicates how to encode
// the literalEncoding and the offsetEncoding.
w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset)
Expand All @@ -876,34 +987,47 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
w.lastHuffMan = true
w.lastHeader, _ = w.headerSize()
if debugDeflate {
count += w.lastHeader
fmt.Println("header:", count/8)
}
}

encoding := w.literalEncoding.codes[:257]
encoding := w.literalEncoding.codes[:256]
// Go 1.16 LOVES having these on stack. At least 1.5x the speed.
bits, nbits, nbytes := w.bits, w.nbits, w.nbytes
for _, t := range input {
// Bitwriting inlined, ~30% speedup
c := encoding[t]
w.bits |= uint64(c.code) << w.nbits
w.nbits += c.len
if w.nbits >= 48 {
bits := w.bits
w.bits >>= 48
w.nbits -= 48
n := w.nbytes
binary.LittleEndian.PutUint64(w.bytes[n:], bits)
n += 6
if n >= bufferFlushSize {
bits |= uint64(c.code) << nbits
nbits += c.len
if debugDeflate {
count += int(c.len)
}
if nbits >= 48 {
binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
bits >>= 48
nbits -= 48
nbytes += 6
if nbytes >= bufferFlushSize {
if w.err != nil {
n = 0
nbytes = 0
return
}
w.write(w.bytes[:n])
n = 0
_, w.err = w.writer.Write(w.bytes[:nbytes])
nbytes = 0
}
w.nbytes = n
}
}
// Restore...
w.bits, w.nbits, w.nbytes = bits, nbits, nbytes

if debugDeflate {
fmt.Println("wrote", count/8, "bytes")
}
if eof || sync {
w.writeCode(encoding[endBlockMarker])
w.writeCode(w.literalEncoding.codes[endBlockMarker])
w.lastHeader = 0
w.lastHuffMan = false
}
Expand Down
Loading

0 comments on commit 6274b7e

Please sign in to comment.