Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions flate/deflate.go
Original file line number Diff line number Diff line change
Expand Up @@ -644,21 +644,21 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
d.fill = (*compressor).fillBlock
d.step = (*compressor).store
case level == ConstantCompression:
d.w.logReusePenalty = uint(4)
d.w.logNewTablePenalty = 4
d.window = make([]byte, maxStoreBlockSize)
d.fill = (*compressor).fillBlock
d.step = (*compressor).storeHuff
case level == DefaultCompression:
level = 5
fallthrough
case level >= 1 && level <= 6:
d.w.logReusePenalty = uint(level + 1)
d.w.logNewTablePenalty = 6
d.fast = newFastEnc(level)
d.window = make([]byte, maxStoreBlockSize)
d.fill = (*compressor).fillBlock
d.step = (*compressor).storeFast
case 7 <= level && level <= 9:
d.w.logReusePenalty = uint(level)
d.w.logNewTablePenalty = 10
d.state = &advancedState{}
d.compressionLevel = levels[level]
d.initDeflate()
Expand Down
52 changes: 34 additions & 18 deletions flate/huffman_bit_writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,12 @@ type huffmanBitWriter struct {
err error
lastHeader int
// Set between 0 (reused block can be up to 2x the size)
logReusePenalty uint
lastHuffMan bool
bytes [256]byte
literalFreq [lengthCodesStart + 32]uint16
offsetFreq [32]uint16
codegenFreq [codegenCodeCount]uint16
logNewTablePenalty uint
lastHuffMan bool
bytes [256]byte
literalFreq [lengthCodesStart + 32]uint16
offsetFreq [32]uint16
codegenFreq [codegenCodeCount]uint16

// codegen must have an extra space for the final symbol.
codegen [literalCount + offsetCodeCount + 1]uint8
Expand All @@ -119,7 +119,7 @@ type huffmanBitWriter struct {
// If lastHuffMan is set, a table for outputting literals has been generated and offsets are invalid.
//
// An incoming block estimates the output size of a fresh table by calculating the
// optimal size and adding a penalty in 'logReusePenalty'.
// optimal size and adding a penalty in 'logNewTablePenalty'.
// A Huffman table is not optimal, which is why we add a penalty, and generating a new table
// is slower both for compression and decompression.

Expand Down Expand Up @@ -349,6 +349,13 @@ func (w *huffmanBitWriter) headerSize() (size, numCodegens int) {
int(w.codegenFreq[18])*7, numCodegens
}

// dynamicReuseSize returns the size in bits of the current literal and offset
// histograms when they are encoded with the supplied encoders.
// Only the symbol codes are counted: nothing is added for a table header or
// for extra bits.
func (w *huffmanBitWriter) dynamicReuseSize(litEnc, offEnc *huffmanEncoder) (size int) {
	litBits := litEnc.bitLength(w.literalFreq[:])
	offBits := offEnc.bitLength(w.offsetFreq[:])
	return litBits + offBits
}

// dynamicSize returns the size of dynamically encoded data in bits.
func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) {
header, numCodegens := w.headerSize()
Expand Down Expand Up @@ -451,12 +458,12 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n

i := 0
for {
var codeWord int = int(w.codegen[i])
var codeWord = uint32(w.codegen[i])
i++
if codeWord == badCode {
break
}
w.writeCode(w.codegenEncoding.codes[uint32(codeWord)])
w.writeCode(w.codegenEncoding.codes[codeWord])

switch codeWord {
case 16:
Expand Down Expand Up @@ -602,14 +609,14 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
var size int
// Check if we should reuse.
if w.lastHeader > 0 {
// Estimate size for using a new table
// Estimate size for using a new table.
// Use the previous header size as the best estimate.
newSize := w.lastHeader + tokens.EstimatedBits()
newSize += newSize >> w.logNewTablePenalty

// The estimated size is calculated as an optimal table.
// We add a penalty to make it more realistic and re-use a bit more.
newSize += newSize >> (w.logReusePenalty & 31)
extra := w.extraBitSize()
reuseSize, _ := w.dynamicSize(w.literalEncoding, w.offsetEncoding, extra)
reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + w.extraBitSize()

// Check if a new table is better.
if newSize < reuseSize {
Expand Down Expand Up @@ -801,21 +808,30 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
}

// Add everything as literals
estBits := histogramSize(input, w.literalFreq[:], !eof && !sync) + 15
// We have to estimate the header size.
// Assume header is around 70 bytes:
// https://stackoverflow.com/a/25454430
const guessHeaderSizeBits = 70 * 8
estBits, estExtra := histogramSize(input, w.literalFreq[:], !eof && !sync)
estBits += w.lastHeader + 15
if w.lastHeader == 0 {
estBits += guessHeaderSizeBits
}
estBits += estBits >> w.logNewTablePenalty

// Store bytes, if we don't get a reasonable improvement.
ssize, storable := w.storedSize(input)
if storable && ssize < (estBits+estBits>>4) {
if storable && ssize < estBits {
w.writeStoredHeader(len(input), eof)
w.writeBytes(input)
return
}

if w.lastHeader > 0 {
size, _ := w.dynamicSize(w.literalEncoding, huffOffset, w.lastHeader)
estBits += estBits >> (w.logReusePenalty)
reuseSize := w.literalEncoding.bitLength(w.literalFreq[:256])
estBits += estExtra

if estBits < size {
if estBits < reuseSize {
// We owe an EOB
w.writeCode(w.literalEncoding.codes[endBlockMarker])
w.lastHeader = 0
Expand Down
5 changes: 4 additions & 1 deletion flate/huffman_bit_writer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ func TestBlockHuff(t *testing.T) {
if strings.HasSuffix(in, ".in") {
out = in[:len(in)-len(".in")] + ".golden"
}
testBlockHuff(t, in, out)
t.Run(in, func(t *testing.T) {
testBlockHuff(t, in, out)
})
}
}

Expand All @@ -45,6 +47,7 @@ func testBlockHuff(t *testing.T, in, out string) {
}
var buf bytes.Buffer
bw := newHuffmanBitWriter(&buf)
bw.logNewTablePenalty = 8
bw.writeBlockHuff(false, all, false)
bw.flush()
got := buf.Bytes()
Expand Down
42 changes: 30 additions & 12 deletions flate/huffman_code.go
Original file line number Diff line number Diff line change
Expand Up @@ -320,26 +320,44 @@ func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
h.assignEncodingAndSize(bitCount, list)
}

// atLeastOne clamps v from below: values smaller than 1 become 1,
// every other value (including NaN) is returned unchanged.
func atLeastOne(v float32) float32 {
	switch {
	case v < 1:
		return 1
	default:
		return v
	}
}

// histogramSize accumulates a histogram of b in h and returns a Shannon-style
// estimate, in bits, of the size of b.
// NOTE(review): two signature lines and two bodies are interleaved below; the
// single-int form uses math.Log2 and the (int, int) form uses mFastLog2.
// In the (int, int) form the first result is the estimate for the values
// present in b, and the second is the estimate for the values absent from b,
// which can only be non-zero when fill is true.
// When fill is true, unassigned values are assigned '1' in the histogram.
// len(h) must be >= 256, and h's elements must be all zeroes.
func histogramSize(b []byte, h []uint16, fill bool) int {
func histogramSize(b []byte, h []uint16, fill bool) (int, int) {
// Only the first 256 entries are used, one per byte value.
h = h[:256]
for _, t := range b {
h[t]++
}
invTotal := 1.0 / float64(len(b))
shannon := 0.0
single := math.Ceil(-math.Log2(invTotal))
for i, v := range h[:] {
if v > 0 {
n := float64(v)
shannon += math.Ceil(-math.Log2(n*invTotal) * n)
} else if fill {
shannon += single
h[i] = 1
invTotal := 1.0 / float32(len(b))
shannon := float32(0.0)
var extra float32
if fill {
// Cost charged for every value that does not occur in b.
oneBits := atLeastOne(-mFastLog2(invTotal))
for i, v := range h[:] {
if v > 0 {
n := float32(v)
// Each occurrence is charged at least one bit.
shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
} else {
h[i] = 1
extra += oneBits
}
}
} else {
for _, v := range h[:] {
if v > 0 {
n := float32(v)
// Each occurrence is charged at least one bit.
shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
}
}
}
return int(shannon + 0.99)

// Adding 0.99 before truncating rounds fractional estimates up.
return int(shannon + 0.99), int(extra + 0.99)
}
Binary file modified flate/testdata/huffman-rand-limit.golden
Binary file not shown.
Binary file modified flate/testdata/huffman-text-shift.golden
Binary file not shown.
Binary file modified flate/testdata/huffman-text.golden
Binary file not shown.
42 changes: 25 additions & 17 deletions flate/token.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,9 +184,7 @@ func (t *tokens) indexTokens(in []token) {
t.Reset()
for _, tok := range in {
if tok < matchType {
t.tokens[t.n] = tok
t.litHist[tok]++
t.n++
t.AddLiteral(tok.literal())
continue
}
t.AddMatch(uint32(tok.length()), tok.offset())
Expand All @@ -211,43 +209,53 @@ func (t *tokens) AddLiteral(lit byte) {
t.nLits++
}

// mFastLog2 returns a fast approximation of log2(val).
// The exponent is read straight from the IEEE-754 bits of val; the remaining
// bits, with the exponent field forced to 127, are fed through a quadratic
// correction polynomial.
// Adapted from https://stackoverflow.com/a/28730362
func mFastLog2(val float32) float32 {
	const exponentMask = 0x7f800000

	raw := math.Float32bits(val)
	// Biased exponent field minus 128; the polynomial below supplies the rest.
	exponent := float32(int32((raw>>23)&255) - 128)
	// Keep sign and mantissa bits, replace the exponent field with 127.
	mantissa := math.Float32frombits((raw &^ exponentMask) | (127 << 23))
	return exponent + (((-0.34484843)*mantissa+2.02466578)*mantissa - 0.67487759)
}

// EstimatedBits returns a minimum size, in bits, estimated for an *optimal*
// compression of the block.
// The estimate sums a Shannon-style cost over litHist, extraHist and offHist,
// adds the extra bits looked up in lengthExtraBits and offsetExtraBits, and
// adds 15 bits for EOB when the block contains literals.
// Nothing is added for a table header.
func (t *tokens) EstimatedBits() int {
shannon := float64(0)
shannon := float32(0)
bits := int(0)
nMatches := 0
if t.nLits > 0 {
invTotal := 1.0 / float64(t.nLits)
invTotal := 1.0 / float32(t.nLits)
for _, v := range t.litHist[:] {
if v > 0 {
n := float64(v)
shannon += math.Ceil(-math.Log2(n*invTotal) * n)
n := float32(v)
shannon += -mFastLog2(n*invTotal) * n
}
}
// Just add 15 for EOB
shannon += 15
// Entry 0 of extraHist is skipped; the remaining entries are also
// summed into nMatches.
for _, v := range t.extraHist[1 : literalCount-256] {
for i, v := range t.extraHist[1 : literalCount-256] {
if v > 0 {
n := float64(v)
shannon += math.Ceil(-math.Log2(n*invTotal) * n)
bits += int(lengthExtraBits[v&31]) * int(v)
n := float32(v)
shannon += -mFastLog2(n*invTotal) * n
bits += int(lengthExtraBits[i&31]) * int(v)
nMatches += int(v)
}
}
}
if nMatches > 0 {
// Offsets are weighted against the number of matches counted above.
invTotal := 1.0 / float64(nMatches)
for _, v := range t.offHist[:offsetCodeCount] {
invTotal := 1.0 / float32(nMatches)
for i, v := range t.offHist[:offsetCodeCount] {
if v > 0 {
n := float64(v)
shannon += math.Ceil(-math.Log2(n*invTotal) * n)
bits += int(offsetExtraBits[v&31]) * int(n)
n := float32(v)
shannon += -mFastLog2(n*invTotal) * n
bits += int(offsetExtraBits[i&31]) * int(v)
}
}
}

return int(shannon) + bits
}

Expand Down
14 changes: 12 additions & 2 deletions flate/token_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package flate

import (
"bytes"
"io/ioutil"
"testing"
)
Expand All @@ -27,8 +28,17 @@ func loadTestTokens(t testFatal) *tokens {
// Test_tokens_EstimatedBits pins the value EstimatedBits returns for the
// tokens from loadTestTokens, and logs the size actually produced by
// writeBlockDynamic for the same tokens so the two can be compared.
func Test_tokens_EstimatedBits(t *testing.T) {
tok := loadTestTokens(t)
// The estimated size, update if method changes.
const expect = 199380
if n := tok.EstimatedBits(); n != expect {
const expect = 221057
n := tok.EstimatedBits()
// Encode the same tokens to obtain the real output size for the log line.
var buf bytes.Buffer
wr := newHuffmanBitWriter(&buf)
wr.writeBlockDynamic(tok, true, nil, true)
if wr.err != nil {
t.Fatal(wr.err)
}
wr.flush()
t.Log("got:", n, "actual:", buf.Len()*8, "(header not part of estimate)")
if n != expect {
t.Error("want:", expect, "bits, got:", n)
}
}
Expand Down