Use hash/adler32 instead of sha1's first 4 bytes

Julien "_FrnchFrgg_" RIVAUD · Julien "_FrnchFrgg_" RIVAUD · commit 5a37dcb61900 · 2019-11-05T19:37:36.000+01:00
Adler-32 should avoid even more allocations, and be faster too.
Also create a custom type for hash results, for clarity and to make it
easier to change if we need to switch to a different hash.
diff --git a/difflib/difflib.go b/difflib/difflib.go
@@ -23,7 +23,7 @@ import (
 	"io"
 	"strings"
 	"unicode"
-	"crypto/sha1"
+	"hash/adler32"
 )
 
 func min(a, b int) int {
@@ -69,22 +69,23 @@ type OpCode struct {
 	J2  int
 }
 
-func _hash(line []byte) int32 {
-	h := sha1.Sum(line)
-	return int32(h[0]) << 3*8 + int32(h[1]) << 2*8 + int32(h[2]) << 1*8 + int32(h[3])
+type lineHash uint32
+
+func _hash(line []byte) lineHash {
+	return lineHash(adler32.Checksum(line))
 }
 
 // This is essentially a map from lines to line numbers, so that later it can
 // be made a bit cleverer than the standard map in that it will not need to
 // store copies of the lines.
 // It needs to hold a reference to the underlying slice of lines.
 type B2J struct {
-	store map[int32] [][]int
+	store map[lineHash] [][]int
 	b [][]byte
 }
 
 func newB2J (b [][]byte) *B2J {
-	b2j := B2J{store: map[int32] [][]int{}, b: b}
+	b2j := B2J{store: map[lineHash] [][]int{}, b: b}
 	for lineno, line := range b {
 		h := _hash(line)
 		// Thanks to the qualities of sha1, the probability of having more than
@@ -129,7 +130,7 @@ func (b2j *B2J) delete(line []byte) {
 	}
 }
 
-func (b2j *B2J) deleteHash(h int32) {
+func (b2j *B2J) deleteHash(h lineHash) {
 	delete(b2j.store, h)
 }
 
@@ -173,9 +174,9 @@ type SequenceMatcher struct {
 	b2j            B2J
 	IsJunk         func([]byte) bool
 	autoJunk       bool
-	bJunk          map[int32]struct{}
+	bJunk          map[lineHash]struct{}
 	matchingBlocks []Match
-	fullBCount     map[int32]int
+	fullBCount     map[lineHash]int
 	bPopular       []int
 	opCodes        []OpCode
 }
@@ -236,7 +237,7 @@ func (m *SequenceMatcher) chainB() {
 	b2j := *newB2J(m.b)
 
 	// Purge junk elements
-	m.bJunk = map[int32]struct{}{}
+	m.bJunk = map[lineHash]struct{}{}
 	if m.IsJunk != nil {
 		junk := m.bJunk
 		b2j.iter(func (s []byte, _ []int){
@@ -560,7 +561,7 @@ func (m *SequenceMatcher) QuickRatio() float64 {
 	// greater due to hash collisions incurring false positives, but
 	// we don't care because we want an upper bound anyway.
 	if m.fullBCount == nil {
-		m.fullBCount = map[int32]int{}
+		m.fullBCount = map[lineHash]int{}
 		for _, s := range m.b {
 			h := _hash(s)
 			m.fullBCount[h] = m.fullBCount[h] + 1
@@ -569,7 +570,7 @@ func (m *SequenceMatcher) QuickRatio() float64 {
 
 	// avail[x] is the number of times x appears in 'b' less the
 	// number of times we've seen it in 'a' so far ... kinda
-	avail := map[int32]int{}
+	avail := map[lineHash]int{}
 	matches := 0
 	for _, s := range m.a {
 		h := _hash(s)
diff --git a/difflib/difflib_test.go b/difflib/difflib_test.go
@@ -264,18 +264,18 @@ func TestWithAsciiBJunk(t *testing.T) {
 	}
 	sm := NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
 		splitChars(rep("a", 44)+rep("b", 40)), true, isJunk)
-	assertEqual(t, sm.bJunk, map[int32]struct{}{})
+	assertEqual(t, sm.bJunk, map[lineHash]struct{}{})
 
 	sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
 		splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)
-	assertEqual(t, sm.bJunk, map[int32]struct{}{_hash(SPACE): struct{}{}})
+	assertEqual(t, sm.bJunk, map[lineHash]struct{}{_hash(SPACE): struct{}{}})
 
 	isJunk = func(s []byte) bool {
 		return len(s) == 1 && (s[0] == ' ' || s[0] == 'b')
 	}
 	sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
 		splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)
-	assertEqual(t, sm.bJunk, map[int32]struct{}{_hash(SPACE): struct{}{}, _hash([]byte{'b'}): struct{}{}})
+	assertEqual(t, sm.bJunk, map[lineHash]struct{}{_hash(SPACE): struct{}{}, _hash([]byte{'b'}): struct{}{}})
 }
 
 func TestSFBugsRatioForNullSeqn(t *testing.T) {

Original file line number	Diff line number	Diff line change
`@@ -264,18 +264,18 @@ func TestWithAsciiBJunk(t *testing.T) {`
`264`	`264`	`}`
`265`	`265`	`sm := NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),`
`266`	`266`	`splitChars(rep("a", 44)+rep("b", 40)), true, isJunk)`
`267`		`- assertEqual(t, sm.bJunk, map[int32]struct{}{})`
	`267`	`+ assertEqual(t, sm.bJunk, map[lineHash]struct{}{})`
`268`	`268`
`269`	`269`	`sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),`
`270`	`270`	`splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)`
`271`		`- assertEqual(t, sm.bJunk, map[int32]struct{}{_hash(SPACE): struct{}{}})`
	`271`	`+ assertEqual(t, sm.bJunk, map[lineHash]struct{}{_hash(SPACE): struct{}{}})`
`272`	`272`
`273`	`273`	`isJunk = func(s []byte) bool {`
`274`	`274`	`return len(s) == 1 && (s[0] == ' ' || s[0] == 'b')`
`275`	`275`	`}`
`276`	`276`	`sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),`
`277`	`277`	`splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)`
`278`		`- assertEqual(t, sm.bJunk, map[int32]struct{}{_hash(SPACE): struct{}{}, _hash([]byte{'b'}): struct{}{}})`
	`278`	`+ assertEqual(t, sm.bJunk, map[lineHash]struct{}{_hash(SPACE): struct{}{}, _hash([]byte{'b'}): struct{}{}})`
`279`	`279`	`}`
`280`	`280`
`281`	`281`	`func TestSFBugsRatioForNullSeqn(t *testing.T) {`