Skip to content

Commit a5f596a

Browse files
committed
Merge branch 'master' of github.com:ianbruene/go-difflib
2 parents 5339ce5 + 5e5c896 commit a5f596a

File tree

1 file changed

+80
-14
lines changed

1 file changed

+80
-14
lines changed

difflib/difflib.go

Lines changed: 80 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ import (
2323
"io"
2424
"strings"
2525
"unicode"
26+
"crypto/sha1"
27+
"encoding/binary"
2628
)
2729

2830
func min(a, b int) int {
@@ -68,6 +70,75 @@ type OpCode struct {
6870
J2 int
6971
}
7072

73+
// B2J maps each distinct line of b to the ascending list of indices at which
// that line occurs in b. It behaves like a map[string][]int, but it is keyed
// by a 32-bit hash of the line rather than by the line itself, so that it
// does not need to store copies of the lines: the content belonging to a slot
// is recovered from b through the first index stored in that slot.
// It needs to hold a reference to the underlying slice of lines.
//
// The zero value is an empty mapping; get, delete and iter are safe on it.
type B2J struct {
	// store maps a line hash to its bucket. A bucket holds exactly one slot
	// per distinct line content sharing that hash, and every slot is a
	// non-empty, ascending list of indices into b.
	store map[int32][][]int
	// b is the indexed sequence. B2J only reads it.
	b []string
}

// _hash returns a 32-bit hash of line, taken from the leading four bytes of
// its SHA-1 digest.
func _hash(line string) int32 {
	sum := sha1.Sum([]byte(line))
	// Read a fixed-width word so that all 32 bits carry digest entropy.
	// Decoding the digest as a varint makes the result depend on the
	// continuation bits of the digest bytes: every digest whose first byte is
	// below 0x80 (half of them) decodes from that single byte and lands in
	// one of only 128 values.
	return int32(binary.BigEndian.Uint32(sum[:4]))
}

// slotIndex returns the hash of line together with the position, inside the
// bucket for that hash, of the slot that belongs to line. The position is -1
// when line has no slot.
func (b2j *B2J) slotIndex(line string) (int32, int) {
	h := _hash(line)
	// Distinct contents rarely share a hash, so a bucket normally holds zero
	// or one slot and this scan is short. Contents are still compared so that
	// a hash collision can never merge two different lines.
	for i, slot := range b2j.store[h] {
		if line == b2j.b[slot[0]] {
			return h, i
		}
	}
	return h, -1
}

// newB2J indexes b and returns the resulting mapping. The returned B2J keeps
// a reference to b, which must not be modified while the mapping is in use.
func newB2J(b []string) *B2J {
	b2j := &B2J{store: map[int32][][]int{}, b: b}
	for lineno, line := range b {
		h, i := b2j.slotIndex(line)
		if i >= 0 {
			// The content already has a slot: record the new occurrence and
			// move on to the next line, so that no second slot is created
			// for the same content.
			b2j.store[h][i] = append(b2j.store[h][i], lineno)
			continue
		}
		// First occurrence of this content: open a slot for it.
		b2j.store[h] = append(b2j.store[h], []int{lineno})
	}
	return b2j
}

// get returns the indices in b at which line occurs, in ascending order, or
// nil if line is not present in the mapping. The returned slice is owned by
// the mapping and must not be modified by the caller.
func (b2j *B2J) get(line string) []int {
	h, i := b2j.slotIndex(line)
	if i < 0 {
		return nil
	}
	return b2j.store[h][i]
}

// delete removes line, with all of its indices, from the mapping. It is a
// no-op if line is not present.
func (b2j *B2J) delete(line string) {
	h, i := b2j.slotIndex(line)
	if i < 0 {
		return
	}
	bucket := b2j.store[h]
	last := len(bucket) - 1
	if last == 0 {
		// Drop the bucket itself instead of leaving an empty one behind.
		delete(b2j.store, h)
		return
	}
	copy(bucket[i:], bucket[i+1:])
	// Clear the vacated tail element so the removed slot can be collected.
	bucket[last] = nil
	b2j.store[h] = bucket[:last]
}

// iter calls hook once for every distinct line in the mapping, passing the
// line and the indices at which it occurs. The iteration order is
// unspecified, as it follows Go's map iteration order.
func (b2j *B2J) iter(hook func(string, []int)) {
	for _, bucket := range b2j.store {
		for _, slot := range bucket {
			hook(b2j.b[slot[0]], slot)
		}
	}
}
141+
71142
// SequenceMatcher compares sequence of strings. The basic
72143
// algorithm predates, and is a little fancier than, an algorithm
73144
// published in the late 1980's by Ratcliff and Obershelp under the
@@ -97,7 +168,7 @@ type OpCode struct {
97168
type SequenceMatcher struct {
98169
a []string
99170
b []string
100-
b2j map[string][]int
171+
b2j B2J
101172
IsJunk func(string) bool
102173
autoJunk bool
103174
bJunk map[string]struct{}
@@ -160,24 +231,19 @@ func (m *SequenceMatcher) SetSeq2(b []string) {
160231

161232
func (m *SequenceMatcher) chainB() {
162233
// Populate line -> index mapping
163-
b2j := map[string][]int{}
164-
for i, s := range m.b {
165-
indices := b2j[s]
166-
indices = append(indices, i)
167-
b2j[s] = indices
168-
}
234+
b2j := *newB2J(m.b)
169235

170236
// Purge junk elements
171237
m.bJunk = map[string]struct{}{}
172238
if m.IsJunk != nil {
173239
junk := m.bJunk
174-
for s, _ := range b2j {
240+
b2j.iter(func (s string, _ []int){
175241
if m.IsJunk(s) {
176242
junk[s] = struct{}{}
177243
}
178-
}
244+
})
179245
for s, _ := range junk {
180-
delete(b2j, s)
246+
b2j.delete(s)
181247
}
182248
}
183249

@@ -186,13 +252,13 @@ func (m *SequenceMatcher) chainB() {
186252
n := len(m.b)
187253
if m.autoJunk && n >= 200 {
188254
ntest := n/100 + 1
189-
for s, indices := range b2j {
255+
b2j.iter(func (s string, indices []int){
190256
if len(indices) > ntest {
191257
popular[s] = struct{}{}
192258
}
193-
}
259+
})
194260
for s, _ := range popular {
195-
delete(b2j, s)
261+
b2j.delete(s)
196262
}
197263
}
198264
m.bPopular = popular
@@ -250,7 +316,7 @@ func (m *SequenceMatcher) findLongestMatch(alo, ahi, blo, bhi int) Match {
250316
// look at all instances of a[i] in b; note that because
251317
// b2j has no junk keys, the loop is skipped if a[i] is junk
252318
newj2len := map[int]int{}
253-
for _, j := range m.b2j[m.a[i]] {
319+
for _, j := range m.b2j.get(m.a[i]) {
254320
// a[i] matches b[j]
255321
if j < blo {
256322
continue

0 commit comments

Comments
 (0)