Merged
126 changes: 94 additions & 32 deletions s2/_generate/gen.go
@@ -59,12 +59,12 @@ func main() {

o.outputMargin = 6
o.maxSkip = 100 // Blocks can be long, limit max skipping.
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm", 16, 7, 7, limit14B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm4MB", 16, 7, 7, 4<<20)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm", 17, 14, 7, 7, limit14B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm4MB", 17, 14, 7, 7, 4<<20)
o.maxSkip = 0
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm12B", 14, 6, 6, limit12B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm10B", 12, 5, 6, limit10B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm8B", 10, 4, 6, limit8B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm12B", 14, 12, 6, 6, limit12B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm10B", 12, 10, 5, 6, limit10B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm8B", 10, 8, 4, 6, limit8B)

// Snappy compatible
o.snappy = true
@@ -76,12 +76,12 @@ func main() {
o.genEncodeBlockAsm("encodeSnappyBlockAsm8B", 8, 4, 4, limit8B)

o.maxSkip = 100
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm", 16, 7, 7, limit14B)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm", 17, 14, 7, 7, limit14B)
o.maxSkip = 0
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm64K", 16, 7, 7, 64<<10-1)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm12B", 14, 6, 6, limit12B)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm10B", 12, 5, 6, limit10B)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm8B", 10, 4, 6, limit8B)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm64K", 16, 14, 7, 7, 64<<10-1)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm12B", 14, 12, 6, 6, limit12B)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm10B", 12, 10, 5, 6, limit10B)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm8B", 10, 8, 4, 6, limit8B)

o.snappy = false
o.outputMargin = 0
@@ -785,7 +785,7 @@ func maxLitOverheadFor(n int) int {
return 5
}

func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHashBytes, maxLen int) {
func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, skipLog, lHashBytes, maxLen int) {
TEXT(name, 0, "func(dst, src []byte) int")
Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.",
fmt.Sprintf("Maximum input %d bytes.", maxLen),
@@ -797,7 +797,6 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
}
var literalMaxOverhead = maxLitOverheadFor(maxLen)

var sTableBits = lTableBits - 2
const sHashBytes = 4
o.maxLen = maxLen

@@ -998,10 +997,34 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
MOVL(s, sTab.Idx(hash1, 4))
}

longVal := GP64()
shortVal := GP64()
MOVQ(Mem{Base: src, Index: candidate, Scale: 1}, longVal)
MOVQ(Mem{Base: src, Index: candidateS, Scale: 1}, shortVal)

// If we have at least 8 bytes match, choose that first.
CMPQ(longVal, cv.As64())
JEQ(LabelRef("candidate_match_" + name))

CMPQ(shortVal, cv.As64())
JNE(LabelRef("no_short_found_" + name))
MOVL(candidateS.As32(), candidate.As32())
JMP(LabelRef("candidate_match_" + name))

Label("no_short_found_" + name)
MOVL(longVal.As32(), longVal.As32())
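In Go terms, the block above prefers whichever candidate already matches all eight loaded bytes before falling back to the 4-byte checks further down; the `MOVL(longVal.As32(), longVal.As32())` after `no_short_found` zeroes the upper half so the later `CMPL` compares only 4 bytes. A minimal sketch of the same selection (`pickCandidate` is a hypothetical name; the equivalent Go change appears in `s2/encode_better.go` below):

```go
import "encoding/binary"

// pickCandidate reports whether either candidate matches cv for a
// full 8 bytes, preferring the long-table candidate.
func pickCandidate(src []byte, cv uint64, candidateL, candidateS int) (int, bool) {
	valLong := binary.LittleEndian.Uint64(src[candidateL:])
	valShort := binary.LittleEndian.Uint64(src[candidateS:])
	if cv == valLong {
		return candidateL, true
	}
	if cv == valShort {
		return candidateS, true
	}
	return candidateL, false // fall through to the 4-byte checks
}
```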

// Enable/disable repeat matching.
// Disabled: too small an improvement.
if false {
{
CMPL(repeatL, U8(0))
JEQ(LabelRef("no_repeat_found_" + name))
}
// Check repeat at offset checkRep
const checkRep = 1
const wantRepeatBytes = 6
const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
{
// rep = s - repeat
rep := GP32()
@@ -1010,10 +1033,13 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash

// if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
left, right := GP64(), GP64()
MOVL(Mem{Base: src, Index: rep, Disp: checkRep, Scale: 1}, right.As32())
MOVQ(Mem{Base: src, Index: rep, Disp: 0, Scale: 1}, right.As64())
MOVQ(cv, left)
SHRQ(U8(checkRep*8), left)
CMPL(left.As32(), right.As32())
tmp := GP64()
MOVQ(U64(repeatMask), tmp)
ANDQ(tmp, left)
ANDQ(tmp, right)
CMPQ(left.As64(), right.As64())
// BAIL, no repeat.
JNE(LabelRef("no_repeat_found_" + name))
}
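The widened load plus mask turns the old 4-byte repeat probe into a 6-byte one: with `checkRep = 1` and `wantRepeatBytes = 6`, `repeatMask` works out to `0x00FFFFFFFFFFFF00`, selecting bytes 1 through 6 of the little-endian word. A sketch of what the two `ANDQ`s and the `CMPQ` compute (hypothetical helper; `binary.LittleEndian.Uint64` stands in for s2's `load64`):

```go
import "encoding/binary"

const (
	checkRep        = 1
	wantRepeatBytes = 6
	// ((1<<48)-1)<<8 == 0x00FFFFFFFFFFFF00
	repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
)

// hasRepeat reports whether the six bytes at s+checkRep equal the six
// bytes at the same position behind the current repeat offset.
// cv is assumed to hold the 8 bytes loaded at s.
func hasRepeat(src []byte, s, repeat int, cv uint64) bool {
	prev := binary.LittleEndian.Uint64(src[s-repeat:])
	return cv&repeatMask == prev&repeatMask
}
```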
@@ -1057,7 +1083,7 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
// Extend forward
{
// s += 4 + checkRep
ADDL(U8(4+checkRep), s)
ADDL(U8(wantRepeatBytes+checkRep), s)

if true {
// candidate := s - repeat + 4 + checkRep
@@ -1097,18 +1123,8 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
offsetVal := GP32()
MOVL(repeatL, offsetVal)

if !o.snappy {
// if nextEmit == 0 {do copy instead...}
TESTL(nextEmit, nextEmit)
JZ(LabelRef("repeat_as_copy_" + name))

// Emit as repeat...
o.emitRepeat("match_repeat_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name), false)

// Emit as copy instead...
Label("repeat_as_copy_" + name)
}
o.emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))
// Emit as repeat...
o.emitRepeat("match_repeat_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name), false)

Label("repeat_end_emit_" + name)
// Store new dst and nextEmit
@@ -1145,11 +1161,11 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
JG(ok)
})

CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
CMPL(longVal.As32(), cv.As32())
JEQ(LabelRef("candidate_match_" + name))

//if uint32(cv) == load32(src, candidateS)
CMPL(Mem{Base: src, Index: candidateS, Scale: 1}, cv.As32())
CMPL(shortVal.As32(), cv.As32())
JEQ(LabelRef("candidateS_match_" + name))

// No match found, next loop
@@ -1338,11 +1354,57 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
}
}
Label("match_nolit_dst_ok_" + name)
// cv must be set to the value at base+1 before arriving here
if true {
lHasher := hashN(lHashBytes, lTableBits)
sHasher := hashN(sHashBytes, sTableBits)

index0, index1 := GP64(), GP64()
// index0 := base + 1
LEAQ(Mem{Base: base, Disp: 1}, index0)
// index1 := s - 2
LEAQ(Mem{Base: s, Disp: -2}, index1)
hash0l, hash0s, hash1l, hash1s := GP64(), GP64(), GP64(), GP64()
MOVQ(Mem{Base: src, Index: index0, Scale: 1, Disp: 0}, hash0l)
MOVQ(Mem{Base: src, Index: index0, Scale: 1, Disp: 1}, hash0s)
MOVQ(Mem{Base: src, Index: index1, Scale: 1, Disp: 0}, hash1l)
MOVQ(Mem{Base: src, Index: index1, Scale: 1, Disp: 1}, hash1s)

lHasher.hash(hash0l)
sHasher.hash(hash0s)
lHasher.hash(hash1l)
sHasher.hash(hash1s)

plusone0, plusone1 := GP64(), GP64()
LEAQ(Mem{Base: index0, Disp: 1}, plusone0)
LEAQ(Mem{Base: index1, Disp: 1}, plusone1)
MOVL(index0.As32(), lTab.Idx(hash0l, 4))
MOVL(index1.As32(), lTab.Idx(hash1l, 4))
MOVL(plusone0.As32(), sTab.Idx(hash0s, 4))
MOVL(plusone1.As32(), sTab.Idx(hash1s, 4))

ADDQ(U8(1), index0)
SUBQ(U8(1), index1)

Label("index_loop_" + name)
CMPQ(index0, index1)
JAE(LabelRef("search_loop_" + name))
hash0l, hash1l = GP64(), GP64()
MOVQ(Mem{Base: src, Index: index0, Scale: 1, Disp: 0}, hash0l)
MOVQ(Mem{Base: src, Index: index1, Scale: 1, Disp: 0}, hash1l)

lHasher.hash(hash0l)
lHasher.hash(hash1l)

MOVL(index0.As32(), lTab.Idx(hash0l, 4))
MOVL(index1.As32(), lTab.Idx(hash1l, 4))

ADDQ(U8(2), index0)
SUBQ(U8(2), index1)
JMP(LabelRef("index_loop_" + name))
} else {
lHasher := hashN(lHashBytes, lTableBits)
sHasher := hashN(sHashBytes, sTableBits)

// Index base+1 long, base+2 short...
cv := GP64()
INCL(base)
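The new scheme (the `if true` branch) indexes the match start and end into both tables, then walks inward from both ends, storing every second position into the long table only. A worked example with assumed offsets (`indexedPositions` is purely illustrative):

```go
// indexedPositions lists the offsets index_loop stores in the long
// table for a match starting at base and ending at s.
func indexedPositions(base, s int) []int {
	long := []int{}
	index0, index1 := base+1, s-2
	long = append(long, index0, index1) // the ends also go in the short table
	index0, index1 = index0+1, index1-1
	for index0 < index1 {
		long = append(long, index0, index1)
		index0 += 2
		index1 -= 2
	}
	return long
}

// indexedPositions(100, 120) == [101 118 102 117 104 115 106 113 108 111]
```

Indexing every second position halves the table writes in the hot loop while still seeding candidates across the whole match.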
@@ -1412,8 +1474,8 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
MOVL(sm2, lTab.Idx(hash0, 4))
MOVL(sm1, sTab.Idx(hash1, 4))
MOVL(sm1, lTab.Idx(hash3, 4))
JMP(LabelRef("search_loop_" + name))
}
JMP(LabelRef("search_loop_" + name))

Label("emit_remainder_" + name)
// Bail if we exceed the maximum size.
99 changes: 72 additions & 27 deletions s2/encode_better.go
@@ -57,7 +57,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
// Initialize the hash tables.
const (
// Long hash matches.
lTableBits = 16
lTableBits = 17
maxLTableSize = 1 << lTableBits

// Short hash matches.
@@ -98,9 +98,26 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
lTable[hashL] = uint32(s)
sTable[hashS] = uint32(s)

valLong := load64(src, candidateL)
valShort := load64(src, candidateS)

// If long matches at least 8 bytes, use that.
if cv == valLong {
break
}
if cv == valShort {
candidateL = candidateS
break
}

// Check repeat at offset checkRep.
const checkRep = 1
if false && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
// Minimum length of a repeat. Tested with various values.
// While 4-5 offers improvements in some, 6 reduces
// regressions significantly.
const wantRepeatBytes = 6
const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
base := s + checkRep
// Extend back
for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
@@ -110,8 +127,8 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
d += emitLiteral(dst[d:], src[nextEmit:base])

// Extend forward
candidate := s - repeat + 4 + checkRep
s += 4 + checkRep
candidate := s - repeat + wantRepeatBytes + checkRep
s += wantRepeatBytes + checkRep
for s < len(src) {
if len(src)-s < 8 {
if src[s] == src[candidate] {
@@ -128,28 +145,40 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
s += 8
candidate += 8
}
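The forward extension visible here compares 8 bytes per iteration and falls back to byte-wise compares near the end of the input. A self-contained sketch of that pattern, assuming the folded lines use the usual s2 XOR-and-count-trailing-zeros trick (`extendMatch` is a hypothetical name; the real loop advances `s` and `candidate` in place):

```go
import (
	"encoding/binary"
	"math/bits"
)

// extendMatch returns how many bytes starting at s match the bytes
// starting at candidate.
func extendMatch(src []byte, s, candidate int) int {
	n := 0
	for s+n+8 <= len(src) {
		a := binary.LittleEndian.Uint64(src[s+n:])
		b := binary.LittleEndian.Uint64(src[candidate+n:])
		if a != b {
			// First differing byte, found via trailing zeros of the XOR.
			return n + bits.TrailingZeros64(a^b)>>3
		}
		n += 8
	}
	for s+n < len(src) && src[s+n] == src[candidate+n] {
		n++
	}
	return n
}
```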
if nextEmit > 0 {
// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
d += emitRepeat(dst[d:], repeat, s-base)
} else {
// First match, cannot be repeat.
d += emitCopy(dst[d:], repeat, s-base)
}
// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
d += emitRepeat(dst[d:], repeat, s-base)
nextEmit = s
if s >= sLimit {
goto emitRemainder
}
// Index in-between
index0 := base + 1
index1 := s - 2

cv = load64(src, s)
for index0 < index1 {
cv0 := load64(src, index0)
cv1 := load64(src, index1)
lTable[hash7(cv0, lTableBits)] = uint32(index0)
sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

lTable[hash7(cv1, lTableBits)] = uint32(index1)
sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
index0 += 2
index1 -= 2
}

cv = load64(src, s)
continue
}

if uint32(cv) == load32(src, candidateL) {
// The long candidate likely matches at least 7 bytes, so prefer it.
if uint32(cv) == uint32(valLong) {
break
}

// Check our short candidate
if uint32(cv) == load32(src, candidateS) {
if uint32(cv) == uint32(valShort) {
// Try a long candidate at s+1
hashL = hash7(cv>>8, lTableBits)
candidateL = int(lTable[hashL])
Expand Down Expand Up @@ -228,21 +257,29 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
// Do we have space for more, if not bail.
return 0
}
// Index match start+1 (long) and start+2 (short)

// Index short & long
index0 := base + 1
// Index match end-2 (long) and end-1 (short)
index1 := s - 2

cv0 := load64(src, index0)
cv1 := load64(src, index1)
cv = load64(src, s)
lTable[hash7(cv0, lTableBits)] = uint32(index0)
lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
lTable[hash7(cv1, lTableBits)] = uint32(index1)
lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)

lTable[hash7(cv1, lTableBits)] = uint32(index1)
sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
index0 += 1
index1 -= 1
cv = load64(src, s)

// index every second long in between.
for index0 < index1 {
lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
index0 += 2
index1 -= 2
}
}

emitRemainder:
@@ -404,21 +441,29 @@ func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
// Do we have space for more, if not bail.
return 0
}
// Index match start+1 (long) and start+2 (short)

// Index short & long
index0 := base + 1
// Index match end-2 (long) and end-1 (short)
index1 := s - 2

cv0 := load64(src, index0)
cv1 := load64(src, index1)
cv = load64(src, s)
lTable[hash7(cv0, lTableBits)] = uint32(index0)
lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
lTable[hash7(cv1, lTableBits)] = uint32(index1)
lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)

lTable[hash7(cv1, lTableBits)] = uint32(index1)
sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
index0 += 1
index1 -= 1
cv = load64(src, s)

// index every second long in between.
for index0 < index1 {
lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
index0 += 2
index1 -= 2
}
}

emitRemainder:
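For context, a minimal round trip through the public API these internals back (the API surface is unchanged by this PR):

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/klauspost/compress/s2"
)

func main() {
	src := bytes.Repeat([]byte("the quick brown fox "), 100)
	enc := s2.EncodeBetter(nil, src) // exercises the "better" paths tuned here
	dec, err := s2.Decode(nil, enc)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%d -> %d bytes, roundtrip ok: %v\n",
		len(src), len(enc), bytes.Equal(src, dec))
}
```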