Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

s2: Add LZ4 block converter #748

Merged
merged 14 commits into from
Feb 17, 2023
Next Next commit
s2: Add LZ4 block converter
## Single threaded performance

Speed excluding LZ4 encoding:

```
BenchmarkLZ4Converter_ConvertBlock/html-32         	   22450	     49962 ns/op	2049.58 MB/s	       559.0 b_saved	       0 B/op	       0 allocs/op
BenchmarkLZ4Converter_ConvertBlock/urls-32         	    1933	    608889 ns/op	1153.06 MB/s	     -3943 b_saved	       0 B/op	       0 allocs/op
BenchmarkLZ4Converter_ConvertBlock/jpg-32          	  428572	      2816 ns/op	43715.74 MB/s	       482.0 b_saved	       0 B/op	       0 allocs/op
BenchmarkLZ4Converter_ConvertBlock/jpg_200b-32     	  413655	      2793 ns/op	44075.51 MB/s	       482.0 b_saved	       0 B/op	       0 allocs/op
BenchmarkLZ4Converter_ConvertBlock/pdf-32          	  163200	      7226 ns/op	14171.69 MB/s	       136.0 b_saved	       0 B/op	       0 allocs/op
BenchmarkLZ4Converter_ConvertBlock/html4-32        	    5451	    214569 ns/op	1908.94 MB/s	      1840 b_saved	       0 B/op	       0 allocs/op
BenchmarkLZ4Converter_ConvertBlock/txt1-32         	    5100	    233525 ns/op	 651.27 MB/s	       106.0 b_saved	       0 B/op	       0 allocs/op
BenchmarkLZ4Converter_ConvertBlock/txt2-32         	    5997	    205362 ns/op	 609.55 MB/s	     -1427 b_saved	       0 B/op	       0 allocs/op
BenchmarkLZ4Converter_ConvertBlock/txt3-32         	    1948	    616892 ns/op	 691.78 MB/s	       384.0 b_saved	       0 B/op	       0 allocs/op
BenchmarkLZ4Converter_ConvertBlock/txt4-32         	    1419	    835555 ns/op	 576.70 MB/s	     -9125 b_saved	       0 B/op	       0 allocs/op
BenchmarkLZ4Converter_ConvertBlock/pb-32           	   29461	     40528 ns/op	2926.08 MB/s	         1.000 b_saved	       0 B/op	       0 allocs/op
BenchmarkLZ4Converter_ConvertBlock/gaviota-32      	    5454	    214264 ns/op	 860.25 MB/s	      9303 b_saved	       0 B/op	       0 allocs/op
BenchmarkLZ4Converter_ConvertBlock/txt1_128b-32    	    5216	    233042 ns/op	 652.62 MB/s	       106.0 b_saved	       0 B/op	       0 allocs/op
BenchmarkLZ4Converter_ConvertBlock/txt1_1000b-32   	    5000	    232401 ns/op	 654.43 MB/s	       106.0 b_saved	       0 B/op	       0 allocs/op
BenchmarkLZ4Converter_ConvertBlock/txt1_10000b-32  	    4999	    232989 ns/op	 652.77 MB/s	       106.0 b_saved	       0 B/op	       0 allocs/op
BenchmarkLZ4Converter_ConvertBlock/txt1_20000b-32  	    4999	    233068 ns/op	 652.55 MB/s	       106.0 b_saved	       0 B/op	       0 allocs/op
```

Reference compression speed:

```
BenchmarkCompressBlockReference/html/default-32 	   14392	     82754 ns/op	1237.41 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/html/better-32  	    6663	    178830 ns/op	 572.61 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/urls/default-32 	    1174	   1002967 ns/op	 700.01 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/urls/better-32  	     607	   1942559 ns/op	 361.42 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/jpg/default-32  	  196694	      6023 ns/op	20436.73 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/jpg/better-32   	   51148	     23613 ns/op	5212.85 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/jpg_200b/default-32         	  197408	      6015 ns/op	20464.42 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/jpg_200b/better-32          	   51267	     23509 ns/op	5236.04 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/pdf/default-32              	   91663	     12891 ns/op	7943.73 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/pdf/better-32               	    8433	    128542 ns/op	 796.62 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/html4/default-32            	   10000	    100385 ns/op	4080.29 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/html4/better-32             	    5712	    204505 ns/op	2002.88 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/txt1/default-32             	    3427	    340552 ns/op	 446.60 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/txt1/better-32              	    2032	    576497 ns/op	 263.82 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/txt2/default-32             	    4359	    266102 ns/op	 470.42 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/txt2/better-32              	    2306	    509822 ns/op	 245.53 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/txt3/default-32             	    1219	    971355 ns/op	 439.34 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/txt3/better-32              	     748	   1598189 ns/op	 267.02 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/txt4/default-32             	    1056	   1129044 ns/op	 426.79 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/txt4/better-32              	     579	   2064813 ns/op	 233.37 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/pb/default-32               	   17960	     65621 ns/op	1807.16 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/pb/better-32                	    7062	    159908 ns/op	 741.60 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/gaviota/default-32          	    3870	    302586 ns/op	 609.15 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/gaviota/better-32           	    2396	    495081 ns/op	 372.30 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/txt1_128b/default-32        	    3426	    342366 ns/op	 444.23 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/txt1_128b/better-32         	    2066	    581844 ns/op	 261.39 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/txt1_1000b/default-32       	    3524	    340442 ns/op	 446.74 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/txt1_1000b/better-32        	    1996	    577981 ns/op	 263.14 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/txt1_10000b/default-32      	    3423	    340195 ns/op	 447.06 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/txt1_10000b/better-32       	    2013	    578202 ns/op	 263.04 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/txt1_20000b/default-32      	    3518	    340101 ns/op	 447.19 MB/s	       0 B/op	       0 allocs/op
BenchmarkCompressBlockReference/txt1_20000b/better-32       	    2031	    581532 ns/op	 261.53 MB/s	       0 B/op	       0 allocs/op
```

Size comparisons (using Go lz4 encoder):

```
=== RUN   TestLZ4Converter_ConvertBlock
=== RUN   TestLZ4Converter_ConvertBlock/html
    lz4convert_test.go:34: input size: 102400
    lz4convert_test.go:35: lz4 size: 21195
    lz4convert_test.go:52: lz4->s2 size: 20636
    lz4convert_test.go:64: s2 (better) size: 18969
    lz4convert_test.go:66: lz4 -> s2 bytes saved: 559
    lz4convert_test.go:67: data -> s2 (better) bytes saved: 2226
    lz4convert_test.go:68: direct data -> s2 better compared to converted from lz4: 1667
=== RUN   TestLZ4Converter_ConvertBlock/urls
    lz4convert_test.go:34: input size: 702087
    lz4convert_test.go:35: lz4 size: 292514
    lz4convert_test.go:52: lz4->s2 size: 296457
    lz4convert_test.go:64: s2 (better) size: 248076
    lz4convert_test.go:66: lz4 -> s2 bytes saved: -3943
    lz4convert_test.go:67: data -> s2 (better) bytes saved: 44438
    lz4convert_test.go:68: direct data -> s2 better compared to converted from lz4: 48381
=== RUN   TestLZ4Converter_ConvertBlock/jpg
    lz4convert_test.go:34: input size: 123093
    lz4convert_test.go:35: lz4 size: 123522
    lz4convert_test.go:52: lz4->s2 size: 123040
    lz4convert_test.go:64: s2 (better) size: 123097
    lz4convert_test.go:66: lz4 -> s2 bytes saved: 482
    lz4convert_test.go:67: data -> s2 (better) bytes saved: 425
    lz4convert_test.go:68: direct data -> s2 better compared to converted from lz4: -57
=== RUN   TestLZ4Converter_ConvertBlock/jpg_200b
    lz4convert_test.go:34: input size: 123093
    lz4convert_test.go:35: lz4 size: 123522
    lz4convert_test.go:52: lz4->s2 size: 123040
    lz4convert_test.go:64: s2 (better) size: 123097
    lz4convert_test.go:66: lz4 -> s2 bytes saved: 482
    lz4convert_test.go:67: data -> s2 (better) bytes saved: 425
    lz4convert_test.go:68: direct data -> s2 better compared to converted from lz4: -57
=== RUN   TestLZ4Converter_ConvertBlock/pdf
    lz4convert_test.go:34: input size: 102400
    lz4convert_test.go:35: lz4 size: 83152
    lz4convert_test.go:52: lz4->s2 size: 83016
    lz4convert_test.go:64: s2 (better) size: 82884
    lz4convert_test.go:66: lz4 -> s2 bytes saved: 136
    lz4convert_test.go:67: data -> s2 (better) bytes saved: 268
    lz4convert_test.go:68: direct data -> s2 better compared to converted from lz4: 132
=== RUN   TestLZ4Converter_ConvertBlock/html4
    lz4convert_test.go:34: input size: 409600
    lz4convert_test.go:35: lz4 size: 81908
    lz4convert_test.go:52: lz4->s2 size: 80068
    lz4convert_test.go:64: s2 (better) size: 18979
    lz4convert_test.go:66: lz4 -> s2 bytes saved: 1840
    lz4convert_test.go:67: data -> s2 (better) bytes saved: 62929
    lz4convert_test.go:68: direct data -> s2 better compared to converted from lz4: 61089
=== RUN   TestLZ4Converter_ConvertBlock/txt1
    lz4convert_test.go:34: input size: 152089
    lz4convert_test.go:35: lz4 size: 79672
    lz4convert_test.go:52: lz4->s2 size: 79566
    lz4convert_test.go:64: s2 (better) size: 71608
    lz4convert_test.go:66: lz4 -> s2 bytes saved: 106
    lz4convert_test.go:67: data -> s2 (better) bytes saved: 8064
    lz4convert_test.go:68: direct data -> s2 better compared to converted from lz4: 7958
=== RUN   TestLZ4Converter_ConvertBlock/txt2
    lz4convert_test.go:34: input size: 125179
    lz4convert_test.go:35: lz4 size: 70801
    lz4convert_test.go:52: lz4->s2 size: 72228
    lz4convert_test.go:64: s2 (better) size: 65938
    lz4convert_test.go:66: lz4 -> s2 bytes saved: -1427
    lz4convert_test.go:67: data -> s2 (better) bytes saved: 4863
    lz4convert_test.go:68: direct data -> s2 better compared to converted from lz4: 6290
=== RUN   TestLZ4Converter_ConvertBlock/txt3
    lz4convert_test.go:34: input size: 426754
    lz4convert_test.go:35: lz4 size: 207038
    lz4convert_test.go:52: lz4->s2 size: 206654
    lz4convert_test.go:64: s2 (better) size: 184936
    lz4convert_test.go:66: lz4 -> s2 bytes saved: 384
    lz4convert_test.go:67: data -> s2 (better) bytes saved: 22102
    lz4convert_test.go:68: direct data -> s2 better compared to converted from lz4: 21718
=== RUN   TestLZ4Converter_ConvertBlock/txt4
    lz4convert_test.go:34: input size: 481861
    lz4convert_test.go:35: lz4 size: 277731
    lz4convert_test.go:52: lz4->s2 size: 286856
    lz4convert_test.go:64: s2 (better) size: 264987
    lz4convert_test.go:66: lz4 -> s2 bytes saved: -9125
    lz4convert_test.go:67: data -> s2 (better) bytes saved: 12744
    lz4convert_test.go:68: direct data -> s2 better compared to converted from lz4: 21869
=== RUN   TestLZ4Converter_ConvertBlock/pb
    lz4convert_test.go:34: input size: 118588
    lz4convert_test.go:35: lz4 size: 19003
    lz4convert_test.go:52: lz4->s2 size: 19002
    lz4convert_test.go:64: s2 (better) size: 17686
    lz4convert_test.go:66: lz4 -> s2 bytes saved: 1
    lz4convert_test.go:67: data -> s2 (better) bytes saved: 1317
    lz4convert_test.go:68: direct data -> s2 better compared to converted from lz4: 1316
=== RUN   TestLZ4Converter_ConvertBlock/gaviota
    lz4convert_test.go:34: input size: 184320
    lz4convert_test.go:35: lz4 size: 71749
    lz4convert_test.go:52: lz4->s2 size: 62446
    lz4convert_test.go:64: s2 (better) size: 55395
    lz4convert_test.go:66: lz4 -> s2 bytes saved: 9303
    lz4convert_test.go:67: data -> s2 (better) bytes saved: 16354
    lz4convert_test.go:68: direct data -> s2 better compared to converted from lz4: 7051
=== RUN   TestLZ4Converter_ConvertBlock/txt1_128b
    lz4convert_test.go:34: input size: 152089
    lz4convert_test.go:35: lz4 size: 79672
    lz4convert_test.go:52: lz4->s2 size: 79566
    lz4convert_test.go:64: s2 (better) size: 71608
    lz4convert_test.go:66: lz4 -> s2 bytes saved: 106
    lz4convert_test.go:67: data -> s2 (better) bytes saved: 8064
    lz4convert_test.go:68: direct data -> s2 better compared to converted from lz4: 7958
=== RUN   TestLZ4Converter_ConvertBlock/txt1_1000b
    lz4convert_test.go:34: input size: 152089
    lz4convert_test.go:35: lz4 size: 79672
    lz4convert_test.go:52: lz4->s2 size: 79566
    lz4convert_test.go:64: s2 (better) size: 71608
    lz4convert_test.go:66: lz4 -> s2 bytes saved: 106
    lz4convert_test.go:67: data -> s2 (better) bytes saved: 8064
    lz4convert_test.go:68: direct data -> s2 better compared to converted from lz4: 7958
=== RUN   TestLZ4Converter_ConvertBlock/txt1_10000b
    lz4convert_test.go:34: input size: 152089
    lz4convert_test.go:35: lz4 size: 79672
    lz4convert_test.go:52: lz4->s2 size: 79566
    lz4convert_test.go:64: s2 (better) size: 71608
    lz4convert_test.go:66: lz4 -> s2 bytes saved: 106
    lz4convert_test.go:67: data -> s2 (better) bytes saved: 8064
    lz4convert_test.go:68: direct data -> s2 better compared to converted from lz4: 7958
=== RUN   TestLZ4Converter_ConvertBlock/txt1_20000b
    lz4convert_test.go:34: input size: 152089
    lz4convert_test.go:35: lz4 size: 79672
    lz4convert_test.go:52: lz4->s2 size: 79566
    lz4convert_test.go:64: s2 (better) size: 71608
    lz4convert_test.go:66: lz4 -> s2 bytes saved: 106
    lz4convert_test.go:67: data -> s2 (better) bytes saved: 8064
    lz4convert_test.go:68: direct data -> s2 better compared to converted from lz4: 7958
```
  • Loading branch information
klauspost committed Feb 7, 2023
commit fe0c088e01a79e5d18f4c38cde1f00dc5d6cf11c
28 changes: 28 additions & 0 deletions internal/lz4ref/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
Copyright (c) 2015, Pierre Curto
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

* Neither the name of xxHash nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

291 changes: 291 additions & 0 deletions internal/lz4ref/block.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,291 @@
package lz4ref

import (
"encoding/binary"
"fmt"
"math/bits"
"sync"
)

const (
// The following constants are used to set up the compression algorithm.
minMatch = 4 // the minimum length of a match sequence (4 bytes)
winSizeLog = 16 // LZ4 64KiB window size limit
winSize = 1 << winSizeLog // window size in bytes (64KiB)
winMask = winSize - 1 // 64KiB window of previous data for dependent blocks

// hashLog determines the size of the hash table used to quickly find a previous match position.
// Its value influences the compression speed and memory usage: the lower the faster,
// but at the expense of the compression ratio.
// 16 seems to be the best compromise for fast compression.
hashLog = 16
htSize = 1 << hashLog // number of entries in the hash table

mfLimit = 10 + minMatch // The last match cannot start within the last 14 bytes.
)

// blockHash maps the low five bytes of x to a hash table index.
// The upper three bytes of x are masked off before hashing, and the
// result keeps only the top hashLog bits of the 64-bit product.
func blockHash(x uint64) uint32 {
	const (
		prime6bytes  = 227718039650203
		fiveByteMask = uint64(1)<<40 - 1
	)
	low := x & fiveByteMask
	product := low * prime6bytes
	return uint32(product >> (64 - hashLog))
}

// CompressBlockBound returns n + n/255 + 16 for an input of n bytes.
// Compressor.CompressBlock compares len(dst) against this value to decide
// whether it may give up and report the input as incompressible.
func CompressBlockBound(n int) int {
	overhead := n/255 + 16
	return n + overhead
}

// Compressor holds the match-finder state used by its CompressBlock method:
// a hash table of candidate match positions and a bitmap of which table
// slots have been written since the last reset.
type Compressor struct {
// Offsets are at most 64kiB, so we can store only the lower 16 bits of
// match positions: effectively, an offset from some 64kiB block boundary.
//
// When we retrieve such an offset, we interpret it as relative to the last
// block boundary si &^ 0xffff, or the one before, (si &^ 0xffff) - 0x10000,
// depending on which of these is inside the current window. If a table
// entry was generated more than 64kiB back in the input, we find out by
// inspecting the input stream.
table [htSize]uint16

// Bitmap indicating which positions in the table are in use.
// This allows us to quickly reset the table for reuse,
// without having to zero everything.
inUse [htSize / 32]uint32
}

// get returns the candidate match position recorded for hash h, as seen
// from the current search position si.
//
// Only the low 16 bits of a position are stored, so the value is rebuilt
// relative to the 64kiB boundary at or below si. If that lands at or after
// si, the previous 64kiB block is assumed instead. A slot that has not been
// written since the last reset reads as 0. The result can be a false
// positive (hash collision or stale entry) and may be negative while
// si < winSize.
func (c *Compressor) get(h uint32, si int) int {
	h &= htSize - 1

	var low int
	if word, bit := c.inUse[h/32], uint32(1)<<(h%32); word&bit != 0 {
		low = int(c.table[h])
	}

	pos := si&^winMask + low
	if pos < si {
		return pos
	}
	// Try previous 64kiB block (negative when in first block).
	return pos - winSize
}

// put records position si for hash h. Only the low 16 bits of si are kept,
// and the slot is flagged as in use so that get will read it.
func (c *Compressor) put(h uint32, si int) {
	slot := h & (htSize - 1)
	c.inUse[slot/32] |= 1 << (slot % 32)
	c.table[slot] = uint16(si)
}

// reset marks every table slot as unused by zeroing the inUse bitmap.
// The table contents themselves are left in place.
func (c *Compressor) reset() {
	for i := range c.inUse {
		c.inUse[i] = 0
	}
}

// compressorPool hands out reusable Compressor values to the package-level
// CompressBlock function.
var compressorPool = sync.Pool{
	New: func() interface{} { return &Compressor{} },
}

// CompressBlock compresses src into dst using a Compressor borrowed from
// compressorPool, and returns whatever that Compressor's CompressBlock
// method returns. The Compressor is handed back to the pool afterwards.
func CompressBlock(src, dst []byte) (int, error) {
	comp := compressorPool.Get().(*Compressor)
	written, err := comp.CompressBlock(src, dst)
	compressorPool.Put(comp)
	return written, err
}

// CompressBlock compresses src into dst as an LZ4 block and returns the
// number of bytes written to dst.
//
// The in-use bitmap is reset on entry, so a Compressor may be reused across calls.
//
// It returns 0, nil when the input is judged incompressible; this can only
// happen when len(dst) < CompressBlockBound(len(src)). It returns
// 0, ErrInvalidSourceShortBuffer when dst runs out of room while a sequence
// or the final literals are being written.
func (c *Compressor) CompressBlock(src, dst []byte) (int, error) {
// Zero out reused table to avoid non-deterministic output (issue #65).
c.reset()

// debug enables tracing of the emitted sequences to stdout.
const debug = false

if debug {
fmt.Printf("lz4 block start: len(src): %d, len(dst):%d \n", len(src), len(dst))
}

// Return 0, nil only if the destination buffer size is < CompressBlockBound.
isNotCompressible := len(dst) < CompressBlockBound(len(src))

// adaptSkipLog sets how quickly the compressor begins skipping blocks when data is incompressible.
// This significantly speeds up incompressible data and usually has very small impact on compression.
// bytes to skip = 1 + (bytes since last match >> adaptSkipLog)
const adaptSkipLog = 7

// si: Current position of the search.
// di: Current write position in dst.
// anchor: Position of the current literals.
var si, di, anchor int
// sn: Position at which the match search stops (mfLimit bytes before the end of src).
sn := len(src) - mfLimit
if sn <= 0 {
// Input too short to search for matches; emit everything as literals.
goto lastLiterals
}

// Fast scan strategy: the hash table only stores the last five-byte sequences.
for si < sn {
// Hash the next five bytes (sequence)...
match := binary.LittleEndian.Uint64(src[si:])
h := blockHash(match)
h2 := blockHash(match >> 8)

// We check a match at si, si+1 and si+2 and pick the first one we get.
// Checking 3 only requires us to load the source once.
ref := c.get(h, si)
ref2 := c.get(h2, si+1)
c.put(h, si)
c.put(h2, si+1)

offset := si - ref

if offset <= 0 || offset >= winSize || uint32(match) != binary.LittleEndian.Uint32(src[ref:]) {
// No match. Start calculating another hash.
// The processor can usually do this out-of-order.
h = blockHash(match >> 16)
ref3 := c.get(h, si+2)

// Check the second match at si+1
si += 1
offset = si - ref2

if offset <= 0 || offset >= winSize || uint32(match>>8) != binary.LittleEndian.Uint32(src[ref2:]) {
// No match. Check the third match at si+2
si += 1
offset = si - ref3
c.put(h, si)

if offset <= 0 || offset >= winSize || uint32(match>>16) != binary.LittleEndian.Uint32(src[ref3:]) {
// Skip one extra byte (at si+3) before we check 3 matches again.
// The skip grows with the distance since the last match (see adaptSkipLog).
si += 2 + (si-anchor)>>adaptSkipLog
continue
}
}
}

// Match found.
lLen := si - anchor // Literal length.
// We already matched 4 bytes.
mLen := 4

// Extend backwards if we can, reducing literals.
tOff := si - offset - 1
for lLen > 0 && tOff >= 0 && src[si-1] == src[tOff] {
si--
tOff--
lLen--
mLen++
}

// Add the match length, so we continue search at the end.
// Use mLen to store the offset base.
si, mLen = si+mLen, si+minMatch

// Find the longest match by looking by batches of 8 bytes.
for si+8 <= sn {
x := binary.LittleEndian.Uint64(src[si:]) ^ binary.LittleEndian.Uint64(src[si-offset:])
if x == 0 {
si += 8
} else {
// Stop at the first non-zero byte.
si += bits.TrailingZeros64(x) >> 3
break
}
}

// mLen now becomes the match length beyond the first minMatch bytes.
mLen = si - mLen
if di >= len(dst) {
return 0, ErrInvalidSourceShortBuffer
}
// Token byte: low nibble holds the match length (0xF means more length bytes follow).
if mLen < 0xF {
dst[di] = byte(mLen)
} else {
dst[di] = 0xF
}

// Encode literals length.
if debug {
fmt.Printf("emit %d literals\n", lLen)
}
// High nibble of the token holds the literal length (0xF means more length bytes follow).
if lLen < 0xF {
dst[di] |= byte(lLen << 4)
} else {
dst[di] |= 0xF0
di++
l := lLen - 0xF
for ; l >= 0xFF && di < len(dst); l -= 0xFF {
dst[di] = 0xFF
di++
}
if di >= len(dst) {
return 0, ErrInvalidSourceShortBuffer
}
dst[di] = byte(l)
}
di++

// Literals.
if di+lLen > len(dst) {
return 0, ErrInvalidSourceShortBuffer
}
copy(dst[di:di+lLen], src[anchor:anchor+lLen])
// Advance past the literals and reserve 2 bytes for the offset written below.
di += lLen + 2
anchor = si

// Encode offset.
if debug {
fmt.Printf("emit copy, length: %d, offset: %d\n", mLen+minMatch, offset)
}
if di > len(dst) {
return 0, ErrInvalidSourceShortBuffer
}
// The offset is stored as 2 bytes, low byte first.
dst[di-2], dst[di-1] = byte(offset), byte(offset>>8)

// Encode match length part 2.
if mLen >= 0xF {
for mLen -= 0xF; mLen >= 0xFF && di < len(dst); mLen -= 0xFF {
dst[di] = 0xFF
di++
}
if di >= len(dst) {
return 0, ErrInvalidSourceShortBuffer
}
dst[di] = byte(mLen)
di++
}
// Check if we can load next values.
if si >= sn {
break
}
// Hash match end-2
h = blockHash(binary.LittleEndian.Uint64(src[si-2:]))
c.put(h, si-2)
}

lastLiterals:
if isNotCompressible && anchor == 0 {
// Incompressible.
return 0, nil
}

// Last literals.
if di >= len(dst) {
return 0, ErrInvalidSourceShortBuffer
}
lLen := len(src) - anchor
// Final token: literal length only, with no match part.
if lLen < 0xF {
dst[di] = byte(lLen << 4)
} else {
dst[di] = 0xF0
di++
for lLen -= 0xF; lLen >= 0xFF && di < len(dst); lLen -= 0xFF {
dst[di] = 0xFF
di++
}
if di >= len(dst) {
return 0, ErrInvalidSourceShortBuffer
}
dst[di] = byte(lLen)
}
di++

// Write the last literals.
if isNotCompressible && di >= anchor {
// Incompressible.
return 0, nil
}
if di+len(src)-anchor > len(dst) {
return 0, ErrInvalidSourceShortBuffer
}
di += copy(dst[di:di+len(src)-anchor], src[anchor:])
return di, nil
}
9 changes: 9 additions & 0 deletions internal/lz4ref/errors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package lz4ref

// Error is a string-backed error type, which lets the package declare its
// errors as constants.
type Error string

// Error implements the error interface by returning the underlying string.
func (e Error) Error() string {
	return string(e)
}

// ErrInvalidSourceShortBuffer is returned by CompressBlock when the
// destination buffer runs out of room while the block is being written.
const ErrInvalidSourceShortBuffer Error = "lz4: invalid source or destination buffer too short"
25 changes: 14 additions & 11 deletions s2/_generate/gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -2099,9 +2099,9 @@ func (o options) emitCopy(name string, length, offset, retval, dstBase reg.GPVir
// dst[i+3] = uint8(offset >> 16)
// dst[i+4] = uint8(offset >> 24)
tmp := GP64()
MOVB(U8(tagCopy4), tmp.As8())
XORL(tmp.As32(), tmp.As32())
// Use displacement to subtract 1 from upshifted length.
LEAL(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length.As32())
LEAL(Mem{Base: tmp, Disp: -(1 << 2) | tagCopy4, Index: length, Scale: 4}, length.As32())
MOVB(length.As8(), Mem{Base: dstBase})
MOVL(offset.As32(), Mem{Base: dstBase, Disp: 1})
// return i + 5
Expand Down Expand Up @@ -2172,6 +2172,12 @@ func (o options) emitCopy(name string, length, offset, retval, dstBase reg.GPVir
JMP(LabelRef("two_byte_offset_" + name))

Label("two_byte_offset_short_" + name)

// Create a length * 4 as early as possible.
length4 := GP32()
MOVL(length.As32(), length4)
SHLL(U8(2), length4)

//if length >= 12 || offset >= 2048 {
CMPL(length.As32(), U8(12))
JGE(LabelRef("emit_copy_three_" + name))
Expand All @@ -2182,15 +2188,13 @@ func (o options) emitCopy(name string, length, offset, retval, dstBase reg.GPVir
// Emit the remaining copy, encoded as 2 bytes.
// dst[1] = uint8(offset)
// dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
tmp := GP64()
MOVB(U8(tagCopy1), tmp.As8())
// Use scale and displacement to shift and subtract values from length.
LEAL(Mem{Base: tmp, Index: length, Scale: 4, Disp: -(4 << 2)}, length.As32())
LEAL(Mem{Base: length4, Disp: -(4 << 2) | tagCopy1}, length4.As32())
MOVB(offset.As8(), Mem{Base: dstBase, Disp: 1}) // Store offset lower byte
SHRL(U8(8), offset.As32()) // Remove lower
SHLL(U8(5), offset.As32()) // Shift back up
ORL(offset.As32(), length.As32()) // OR result
MOVB(length.As8(), Mem{Base: dstBase, Disp: 0})
ORL(offset.As32(), length4.As32()) // OR result
MOVB(length4.As8(), Mem{Base: dstBase, Disp: 0})
if retval != nil {
ADDQ(U8(2), retval) // i += 2
}
Expand All @@ -2203,10 +2207,9 @@ func (o options) emitCopy(name string, length, offset, retval, dstBase reg.GPVir
// dst[2] = uint8(offset >> 8)
// dst[1] = uint8(offset)
// dst[0] = uint8(length-1)<<2 | tagCopy2
tmp = GP64()
MOVB(U8(tagCopy2), tmp.As8())
LEAL(Mem{Base: tmp, Disp: -(1 << 2), Index: length, Scale: 4}, length.As32())
MOVB(length.As8(), Mem{Base: dstBase})

LEAL(Mem{Base: length4, Disp: -(1 << 2) | tagCopy2}, length4.As32())
MOVB(length4.As8(), Mem{Base: dstBase})
MOVW(offset.As16(), Mem{Base: dstBase, Disp: 1})
// return 3
if retval != nil {
Expand Down
Loading