Skip to content

Commit 5a37dcb

Browse files
author
Julien "_FrnchFrgg_" RIVAUD
committed
Use hash.adler32 instead of sha1's first 4 bytes
It should avoid even more allocations, and be faster too. Also create a custom type for hash results for clarity and ease of change if we need to update the hash.
1 parent 790830c commit 5a37dcb

File tree

2 files changed

+16
-15
lines changed

2 files changed

+16
-15
lines changed

difflib/difflib.go

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ import (
2323
"io"
2424
"strings"
2525
"unicode"
26-
"crypto/sha1"
26+
"hash/adler32"
2727
)
2828

2929
func min(a, b int) int {
@@ -69,22 +69,23 @@ type OpCode struct {
6969
J2 int
7070
}
7171

72-
func _hash(line []byte) int32 {
73-
h := sha1.Sum(line)
74-
return int32(h[0]) << 3*8 + int32(h[1]) << 2*8 + int32(h[2]) << 1*8 + int32(h[3])
72+
type lineHash uint32
73+
74+
func _hash(line []byte) lineHash {
75+
return lineHash(adler32.Checksum(line))
7576
}
7677

7778
// This is essentially a map from lines to line numbers, so that later it can
7879
// be made a bit cleverer than the standard map in that it will not need to
7980
// store copies of the lines.
8081
// It needs to hold a reference to the underlying slice of lines.
8182
type B2J struct {
82-
store map[int32] [][]int
83+
store map[lineHash] [][]int
8384
b [][]byte
8485
}
8586

8687
func newB2J (b [][]byte) *B2J {
87-
b2j := B2J{store: map[int32] [][]int{}, b: b}
88+
b2j := B2J{store: map[lineHash] [][]int{}, b: b}
8889
for lineno, line := range b {
8990
h := _hash(line)
9091
// Thanks to the qualities of the hash function, the probability of having more than
@@ -129,7 +130,7 @@ func (b2j *B2J) delete(line []byte) {
129130
}
130131
}
131132

132-
func (b2j *B2J) deleteHash(h int32) {
133+
func (b2j *B2J) deleteHash(h lineHash) {
133134
delete(b2j.store, h)
134135
}
135136

@@ -173,9 +174,9 @@ type SequenceMatcher struct {
173174
b2j B2J
174175
IsJunk func([]byte) bool
175176
autoJunk bool
176-
bJunk map[int32]struct{}
177+
bJunk map[lineHash]struct{}
177178
matchingBlocks []Match
178-
fullBCount map[int32]int
179+
fullBCount map[lineHash]int
179180
bPopular []int
180181
opCodes []OpCode
181182
}
@@ -236,7 +237,7 @@ func (m *SequenceMatcher) chainB() {
236237
b2j := *newB2J(m.b)
237238

238239
// Purge junk elements
239-
m.bJunk = map[int32]struct{}{}
240+
m.bJunk = map[lineHash]struct{}{}
240241
if m.IsJunk != nil {
241242
junk := m.bJunk
242243
b2j.iter(func (s []byte, _ []int){
@@ -560,7 +561,7 @@ func (m *SequenceMatcher) QuickRatio() float64 {
560561
// greater due to hash collisions incurring false positives, but
561562
// we don't care because we want an upper bound anyway.
562563
if m.fullBCount == nil {
563-
m.fullBCount = map[int32]int{}
564+
m.fullBCount = map[lineHash]int{}
564565
for _, s := range m.b {
565566
h := _hash(s)
566567
m.fullBCount[h] = m.fullBCount[h] + 1
@@ -569,7 +570,7 @@ func (m *SequenceMatcher) QuickRatio() float64 {
569570

570571
// avail[x] is the number of times x appears in 'b' less the
571572
// number of times we've seen it in 'a' so far ... kinda
572-
avail := map[int32]int{}
573+
avail := map[lineHash]int{}
573574
matches := 0
574575
for _, s := range m.a {
575576
h := _hash(s)

difflib/difflib_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -264,18 +264,18 @@ func TestWithAsciiBJunk(t *testing.T) {
264264
}
265265
sm := NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
266266
splitChars(rep("a", 44)+rep("b", 40)), true, isJunk)
267-
assertEqual(t, sm.bJunk, map[int32]struct{}{})
267+
assertEqual(t, sm.bJunk, map[lineHash]struct{}{})
268268

269269
sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
270270
splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)
271-
assertEqual(t, sm.bJunk, map[int32]struct{}{_hash(SPACE): struct{}{}})
271+
assertEqual(t, sm.bJunk, map[lineHash]struct{}{_hash(SPACE): struct{}{}})
272272

273273
isJunk = func(s []byte) bool {
274274
return len(s) == 1 && (s[0] == ' ' || s[0] == 'b')
275275
}
276276
sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
277277
splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)
278-
assertEqual(t, sm.bJunk, map[int32]struct{}{_hash(SPACE): struct{}{}, _hash([]byte{'b'}): struct{}{}})
278+
assertEqual(t, sm.bJunk, map[lineHash]struct{}{_hash(SPACE): struct{}{}, _hash([]byte{'b'}): struct{}{}})
279279
}
280280

281281
func TestSFBugsRatioForNullSeqn(t *testing.T) {

0 commit comments

Comments
 (0)