Merged
126 changes: 94 additions & 32 deletions s2/_generate/gen.go
@@ -59,12 +59,12 @@ func main() {

o.outputMargin = 6
o.maxSkip = 100 // Blocks can be long, limit max skipping.
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm", 16, 7, 7, limit14B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm4MB", 16, 7, 7, 4<<20)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm", 17, 14, 7, 7, limit14B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm4MB", 17, 14, 7, 7, 4<<20)
o.maxSkip = 0
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm12B", 14, 6, 6, limit12B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm10B", 12, 5, 6, limit10B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm8B", 10, 4, 6, limit8B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm12B", 14, 12, 6, 6, limit12B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm10B", 12, 10, 5, 6, limit10B)
o.genEncodeBetterBlockAsm("encodeBetterBlockAsm8B", 10, 8, 4, 6, limit8B)

// Snappy compatible
o.snappy = true
@@ -76,12 +76,12 @@ func main() {
o.genEncodeBlockAsm("encodeSnappyBlockAsm8B", 8, 4, 4, limit8B)

o.maxSkip = 100
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm", 16, 7, 7, limit14B)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm", 17, 14, 7, 7, limit14B)
o.maxSkip = 0
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm64K", 16, 7, 7, 64<<10-1)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm12B", 14, 6, 6, limit12B)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm10B", 12, 5, 6, limit10B)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm8B", 10, 4, 6, limit8B)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm64K", 16, 14, 7, 7, 64<<10-1)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm12B", 14, 12, 6, 6, limit12B)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm10B", 12, 10, 5, 6, limit10B)
o.genEncodeBetterBlockAsm("encodeSnappyBetterBlockAsm8B", 10, 8, 4, 6, limit8B)

o.snappy = false
o.outputMargin = 0
@@ -785,7 +785,7 @@ func maxLitOverheadFor(n int) int {
return 5
}

func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHashBytes, maxLen int) {
func (o options) genEncodeBetterBlockAsm(name string, lTableBits, sTableBits, skipLog, lHashBytes, maxLen int) {
TEXT(name, 0, "func(dst, src []byte) int")
Doc(name+" encodes a non-empty src to a guaranteed-large-enough dst.",
fmt.Sprintf("Maximum input %d bytes.", maxLen),
@@ -797,7 +797,6 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
}
var literalMaxOverhead = maxLitOverheadFor(maxLen)

var sTableBits = lTableBits - 2
const sHashBytes = 4
o.maxLen = maxLen

@@ -998,10 +997,34 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
MOVL(s, sTab.Idx(hash1, 4))
}

longVal := GP64()
shortVal := GP64()
MOVQ(Mem{Base: src, Index: candidate, Scale: 1}, longVal)
MOVQ(Mem{Base: src, Index: candidateS, Scale: 1}, shortVal)

// If we have at least 8 bytes match, choose that first.
CMPQ(longVal, cv.As64())
JEQ(LabelRef("candidate_match_" + name))

CMPQ(shortVal, cv.As64())
JNE(LabelRef("no_short_found_" + name))
MOVL(candidateS.As32(), candidate.As32())
JMP(LabelRef("candidate_match_" + name))

Label("no_short_found_" + name)
MOVL(longVal.As32(), longVal.As32())
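In Go terms, the block above prefers whichever candidate already matches all eight loaded bytes before falling back to the 4-byte checks further down; the `MOVL(longVal.As32(), longVal.As32())` after `no_short_found` zeroes the upper half so the later `CMPL` compares only 4 bytes. A minimal sketch of the same selection (`pickCandidate` is a hypothetical name; the equivalent Go change appears in `s2/encode_better.go` below):

```go
import "encoding/binary"

// pickCandidate reports whether either candidate matches cv for a
// full 8 bytes, preferring the long-table candidate.
func pickCandidate(src []byte, cv uint64, candidateL, candidateS int) (int, bool) {
	valLong := binary.LittleEndian.Uint64(src[candidateL:])
	valShort := binary.LittleEndian.Uint64(src[candidateS:])
	if cv == valLong {
		return candidateL, true
	}
	if cv == valShort {
		return candidateS, true
	}
	return candidateL, false // fall through to the 4-byte checks
}
```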

// Enable/disable repeat matching.
// Disabled: too small an improvement.
if false {
{
CMPL(repeatL, U8(0))
JEQ(LabelRef("no_repeat_found_" + name))
}
// Check repeat at offset checkRep
const checkRep = 1
const wantRepeatBytes = 6
const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
{
// rep = s - repeat
rep := GP32()
@@ -1010,10 +1033,13 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash

// if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
left, right := GP64(), GP64()
MOVL(Mem{Base: src, Index: rep, Disp: checkRep, Scale: 1}, right.As32())
MOVQ(Mem{Base: src, Index: rep, Disp: 0, Scale: 1}, right.As64())
MOVQ(cv, left)
SHRQ(U8(checkRep*8), left)
CMPL(left.As32(), right.As32())
tmp := GP64()
MOVQ(U64(repeatMask), tmp)
ANDQ(tmp, left)
ANDQ(tmp, right)
CMPQ(left.As64(), right.As64())
// BAIL, no repeat.
JNE(LabelRef("no_repeat_found_" + name))
}
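The widened load plus mask turns the old 4-byte repeat probe into a 6-byte one: with `checkRep = 1` and `wantRepeatBytes = 6`, `repeatMask` works out to `0x00FFFFFFFFFFFF00`, selecting bytes 1 through 6 of the little-endian word. A sketch of what the two `ANDQ`s and the `CMPQ` compute (hypothetical helper; `binary.LittleEndian.Uint64` stands in for s2's `load64`):

```go
import "encoding/binary"

const (
	checkRep        = 1
	wantRepeatBytes = 6
	// ((1<<48)-1)<<8 == 0x00FFFFFFFFFFFF00
	repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
)

// hasRepeat reports whether the six bytes at s+checkRep equal the six
// bytes at the same position behind the current repeat offset.
// cv is assumed to hold the 8 bytes loaded at s.
func hasRepeat(src []byte, s, repeat int, cv uint64) bool {
	prev := binary.LittleEndian.Uint64(src[s-repeat:])
	return cv&repeatMask == prev&repeatMask
}
```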
@@ -1057,7 +1083,7 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
// Extend forward
{
// s += 4 + checkRep
ADDL(U8(4+checkRep), s)
ADDL(U8(wantRepeatBytes+checkRep), s)

if true {
// candidate := s - repeat + 4 + checkRep
@@ -1097,18 +1123,8 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
offsetVal := GP32()
MOVL(repeatL, offsetVal)

if !o.snappy {
// if nextEmit == 0 {do copy instead...}
TESTL(nextEmit, nextEmit)
JZ(LabelRef("repeat_as_copy_" + name))

// Emit as repeat...
o.emitRepeat("match_repeat_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name), false)

// Emit as copy instead...
Label("repeat_as_copy_" + name)
}
o.emitCopy("repeat_as_copy_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name))
// Emit as repeat...
o.emitRepeat("match_repeat_"+name, length, offsetVal, nil, dst, LabelRef("repeat_end_emit_"+name), false)

Label("repeat_end_emit_" + name)
// Store new dst and nextEmit
@@ -1145,11 +1161,11 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
JG(ok)
})

CMPL(Mem{Base: src, Index: candidate, Scale: 1}, cv.As32())
CMPL(longVal.As32(), cv.As32())
JEQ(LabelRef("candidate_match_" + name))

//if uint32(cv) == load32(src, candidateS)
CMPL(Mem{Base: src, Index: candidateS, Scale: 1}, cv.As32())
CMPL(shortVal.As32(), cv.As32())
JEQ(LabelRef("candidateS_match_" + name))

// No match found, next loop
@@ -1338,11 +1354,57 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
}
}
Label("match_nolit_dst_ok_" + name)
// cv must be set to the value at base+1 before arriving here
if true {
lHasher := hashN(lHashBytes, lTableBits)
sHasher := hashN(sHashBytes, sTableBits)

index0, index1 := GP64(), GP64()
// index0 := base + 1
LEAQ(Mem{Base: base, Disp: 1}, index0)
// index1 := s - 2
LEAQ(Mem{Base: s, Disp: -2}, index1)
hash0l, hash0s, hash1l, hash1s := GP64(), GP64(), GP64(), GP64()
MOVQ(Mem{Base: src, Index: index0, Scale: 1, Disp: 0}, hash0l)
MOVQ(Mem{Base: src, Index: index0, Scale: 1, Disp: 1}, hash0s)
MOVQ(Mem{Base: src, Index: index1, Scale: 1, Disp: 0}, hash1l)
MOVQ(Mem{Base: src, Index: index1, Scale: 1, Disp: 1}, hash1s)

lHasher.hash(hash0l)
sHasher.hash(hash0s)
lHasher.hash(hash1l)
sHasher.hash(hash1s)

plusone0, plusone1 := GP64(), GP64()
LEAQ(Mem{Base: index0, Disp: 1}, plusone0)
LEAQ(Mem{Base: index1, Disp: 1}, plusone1)
MOVL(index0.As32(), lTab.Idx(hash0l, 4))
MOVL(index1.As32(), lTab.Idx(hash1l, 4))
MOVL(plusone0.As32(), sTab.Idx(hash0s, 4))
MOVL(plusone1.As32(), sTab.Idx(hash1s, 4))

ADDQ(U8(1), index0)
SUBQ(U8(1), index1)

Label("index_loop_" + name)
CMPQ(index0, index1)
JAE(LabelRef("search_loop_" + name))
hash0l, hash1l = GP64(), GP64()
MOVQ(Mem{Base: src, Index: index0, Scale: 1, Disp: 0}, hash0l)
MOVQ(Mem{Base: src, Index: index1, Scale: 1, Disp: 0}, hash1l)

lHasher.hash(hash0l)
lHasher.hash(hash1l)

MOVL(index0.As32(), lTab.Idx(hash0l, 4))
MOVL(index1.As32(), lTab.Idx(hash1l, 4))

ADDQ(U8(2), index0)
SUBQ(U8(2), index1)
JMP(LabelRef("index_loop_" + name))
} else {
lHasher := hashN(lHashBytes, lTableBits)
sHasher := hashN(sHashBytes, sTableBits)

// Index base+1 long, base+2 short...
cv := GP64()
INCL(base)
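The new scheme (the `if true` branch) indexes the match start and end into both tables, then walks inward from both ends, storing every second position into the long table only. A worked example with assumed offsets (`indexedPositions` is purely illustrative):

```go
// indexedPositions lists the offsets index_loop stores in the long
// table for a match starting at base and ending at s.
func indexedPositions(base, s int) []int {
	long := []int{}
	index0, index1 := base+1, s-2
	long = append(long, index0, index1) // the ends also go in the short table
	index0, index1 = index0+1, index1-1
	for index0 < index1 {
		long = append(long, index0, index1)
		index0 += 2
		index1 -= 2
	}
	return long
}

// indexedPositions(100, 120) == [101 118 102 117 104 115 106 113 108 111]
```

Indexing every second position halves the table writes in the hot loop while still seeding candidates across the whole match.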
@@ -1412,8 +1474,8 @@ func (o options) genEncodeBetterBlockAsm(name string, lTableBits, skipLog, lHash
MOVL(sm2, lTab.Idx(hash0, 4))
MOVL(sm1, sTab.Idx(hash1, 4))
MOVL(sm1, lTab.Idx(hash3, 4))
JMP(LabelRef("search_loop_" + name))
}
JMP(LabelRef("search_loop_" + name))

Label("emit_remainder_" + name)
// Bail if we exceed the maximum size.
99 changes: 72 additions & 27 deletions s2/encode_better.go
@@ -57,7 +57,7 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
// Initialize the hash tables.
const (
// Long hash matches.
lTableBits = 16
lTableBits = 17
maxLTableSize = 1 << lTableBits

// Short hash matches.
@@ -98,9 +98,26 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
lTable[hashL] = uint32(s)
sTable[hashS] = uint32(s)

valLong := load64(src, candidateL)
valShort := load64(src, candidateS)

// If long matches at least 8 bytes, use that.
if cv == valLong {
break
}
if cv == valShort {
candidateL = candidateS
break
}

// Check repeat at offset checkRep.
const checkRep = 1
if false && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
// Minimum length of a repeat. Tested with various values.
// While 4-5 offers improvements in some, 6 reduces
// regressions significantly.
const wantRepeatBytes = 6
const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
base := s + checkRep
// Extend back
for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
@@ -110,8 +127,8 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
d += emitLiteral(dst[d:], src[nextEmit:base])

// Extend forward
candidate := s - repeat + 4 + checkRep
s += 4 + checkRep
candidate := s - repeat + wantRepeatBytes + checkRep
s += wantRepeatBytes + checkRep
for s < len(src) {
if len(src)-s < 8 {
if src[s] == src[candidate] {
@@ -128,28 +145,40 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
s += 8
candidate += 8
}
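The forward extension visible here compares 8 bytes per iteration and falls back to byte-wise compares near the end of the input. A self-contained sketch of that pattern, assuming the folded lines use the usual s2 XOR-and-count-trailing-zeros trick (`extendMatch` is a hypothetical name; the real loop advances `s` and `candidate` in place):

```go
import (
	"encoding/binary"
	"math/bits"
)

// extendMatch returns how many bytes starting at s match the bytes
// starting at candidate.
func extendMatch(src []byte, s, candidate int) int {
	n := 0
	for s+n+8 <= len(src) {
		a := binary.LittleEndian.Uint64(src[s+n:])
		b := binary.LittleEndian.Uint64(src[candidate+n:])
		if a != b {
			// First differing byte, found via trailing zeros of the XOR.
			return n + bits.TrailingZeros64(a^b)>>3
		}
		n += 8
	}
	for s+n < len(src) && src[s+n] == src[candidate+n] {
		n++
	}
	return n
}
```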
if nextEmit > 0 {
// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
d += emitRepeat(dst[d:], repeat, s-base)
} else {
// First match, cannot be repeat.
d += emitCopy(dst[d:], repeat, s-base)
}
// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
d += emitRepeat(dst[d:], repeat, s-base)
nextEmit = s
if s >= sLimit {
goto emitRemainder
}
// Index in-between
index0 := base + 1
index1 := s - 2

cv = load64(src, s)
for index0 < index1 {
cv0 := load64(src, index0)
cv1 := load64(src, index1)
lTable[hash7(cv0, lTableBits)] = uint32(index0)
sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)

lTable[hash7(cv1, lTableBits)] = uint32(index1)
sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
index0 += 2
index1 -= 2
}

cv = load64(src, s)
continue
}

if uint32(cv) == load32(src, candidateL) {
// The long candidate likely matches at least 7 bytes, so prefer it.
if uint32(cv) == uint32(valLong) {
break
}

// Check our short candidate
if uint32(cv) == load32(src, candidateS) {
if uint32(cv) == uint32(valShort) {
// Try a long candidate at s+1
hashL = hash7(cv>>8, lTableBits)
candidateL = int(lTable[hashL])
Expand Down Expand Up @@ -228,21 +257,29 @@ func encodeBlockBetterGo(dst, src []byte) (d int) {
// Do we have space for more, if not bail.
return 0
}
// Index match start+1 (long) and start+2 (short)

// Index short & long
index0 := base + 1
// Index match end-2 (long) and end-1 (short)
index1 := s - 2

cv0 := load64(src, index0)
cv1 := load64(src, index1)
cv = load64(src, s)
lTable[hash7(cv0, lTableBits)] = uint32(index0)
lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
lTable[hash7(cv1, lTableBits)] = uint32(index1)
lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)

lTable[hash7(cv1, lTableBits)] = uint32(index1)
sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
index0 += 1
index1 -= 1
cv = load64(src, s)

// index every second long in between.
for index0 < index1 {
lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
index0 += 2
index1 -= 2
}
}

emitRemainder:
@@ -404,21 +441,29 @@ func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
// Do we have space for more, if not bail.
return 0
}
// Index match start+1 (long) and start+2 (short)

// Index short & long
index0 := base + 1
// Index match end-2 (long) and end-1 (short)
index1 := s - 2

cv0 := load64(src, index0)
cv1 := load64(src, index1)
cv = load64(src, s)
lTable[hash7(cv0, lTableBits)] = uint32(index0)
lTable[hash7(cv0>>8, lTableBits)] = uint32(index0 + 1)
lTable[hash7(cv1, lTableBits)] = uint32(index1)
lTable[hash7(cv1>>8, lTableBits)] = uint32(index1 + 1)
sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
sTable[hash4(cv0>>16, sTableBits)] = uint32(index0 + 2)

lTable[hash7(cv1, lTableBits)] = uint32(index1)
sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
index0 += 1
index1 -= 1
cv = load64(src, s)

// index every second long in between.
for index0 < index1 {
lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
index0 += 2
index1 -= 2
}
}

emitRemainder:
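For context, a minimal round trip through the public API these internals back (the API surface is unchanged by this PR):

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/klauspost/compress/s2"
)

func main() {
	src := bytes.Repeat([]byte("the quick brown fox "), 100)
	enc := s2.EncodeBetter(nil, src) // exercises the "better" paths tuned here
	dec, err := s2.Decode(nil, enc)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%d -> %d bytes, roundtrip ok: %v\n",
		len(src), len(enc), bytes.Equal(src, dec))
}
```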