Skip to content

Commit 4ca4252

Browse files
authored
Merge pull request ianbruene#7 from FrnchFrgg/master
Improve population of the "line contents -> indices in b" map
2 parents 7f0db69 + 9cc2d29 commit 4ca4252

File tree

4 files changed

+115
-124
lines changed

4 files changed

+115
-124
lines changed

difflib/bytes/bytes.go

Lines changed: 64 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ func calculateRatio(matches, length int) float64 {
4949

5050
// listifyString splits str into one single-byte slice per input byte.
//
// Each element is a view into str's backing array rather than a copy, so no
// per-byte allocation takes place. The views are built with a full
// (three-index) slice expression that caps their capacity at 1: an append on
// any element therefore reallocates instead of silently overwriting the next
// byte of str, which is also the content of the neighbouring element.
//
// A nil or empty str yields an empty, non-nil result.
func listifyString(str []byte) (lst [][]byte) {
	lst = make([][]byte, len(str))
	for i := range str {
		lst[i] = str[i : i+1 : i+1]
	}
	return lst
}
@@ -84,62 +84,79 @@ type B2J struct {
8484
b [][]byte
8585
}
8686

87-
func newB2J (b [][]byte) *B2J {
88-
b2j := B2J{store: map[lineHash] [][]int{}, b: b}
89-
for lineno, line := range b {
90-
h := _hash(line)
87+
// lineType classifies the slot that B2J keeps for a given line content.
// The negative values double as in-slot markers: they are stored as the
// second element of a two-element slot, where a real line index (always
// non-negative) could otherwise appear.
type lineType int8
88+
const (
89+
// lineNONE: the line content has no slot.
lineNONE lineType = 0
90+
// lineNORMAL: the slot lists the indices in b where the line occurs.
lineNORMAL lineType = 1
91+
// lineJUNK: the slot is [first index, -1]; the junk predicate matched.
lineJUNK lineType = -1
92+
// linePOPULAR: the slot is [first index, -2]; the line occurred too often.
linePOPULAR lineType = -2
93+
)
94+
95+
// _find looks up the slot that holds the content *line.
//
// It returns the hash h of the line, the position slotIndex of the matching
// slot inside b2j.store[h], the slot itself, and the slot's lineType. When no
// slot in the bucket has the same content, it returns slotIndex -1, a nil
// slot and lineNONE (h is still the line's hash).
func (b2j *B2J) _find(line *[]byte) (h lineHash, slotIndex int,
96+
slot []int, lt lineType) {
97+
h = _hash(*line)
98+
// The range assigns (=, not :=) to the named results, so the bare return
// inside the loop hands back the matching slotIndex and slot.
for slotIndex, slot = range b2j.store[h] {
9199
// Thanks to the qualities of sha1, the probability of having more than
92100
// one line content with the same hash is very low. Nevertheless, store
93101
// each of them in a different slot, which we can tell apart by
94102
// looking at the line contents in the b slice.
95-
for slotIndex, slot := range b2j.store[h] {
96-
if bytes.Equal(line, b[slot[0]]) {
97-
// The content already has a slot in its hash bucket. Just
98-
// append the newly seen index to the slice in that slot
99-
b2j.store[h][slotIndex] = append(slot, lineno)
100-
goto cont
103+
// In place of all the line numbers where the line appears, a slot can
104+
// also contain [lineno, -1] if b[lineno] is junk, or [lineno, -2] if
// b[lineno] is popular. slot[0] is a valid index into b in every case.
105+
if bytes.Equal(*line, b2j.b[slot[0]]) {
106+
// The content already has a slot in its hash bucket.
107+
if len(slot) == 2 && slot[1] < 0 {
108+
// A negative second element is a lineJUNK / linePOPULAR marker.
lt = lineType(slot[1])
109+
} else {
110+
lt = lineNORMAL
101111
}
112+
return // every return variable has the correct value
102113
}
103-
// The line content still has no slot. Create one with a single value.
104-
b2j.store[h] = append(b2j.store[h], []int{lineno})
105-
cont:
106114
}
107-
return &b2j
115+
// The line content still has no slot.
116+
slotIndex = -1
117+
slot = nil
118+
lt = lineNONE
119+
return
108120
}
109121

110-
func (b2j *B2J) get(line []byte) []int {
111-
// Thanks to the qualities of sha1, there should be very few (zero or one)
112-
// slots, so the following loop is fast.
113-
for _, slot := range b2j.store[_hash(line)] {
114-
if bytes.Equal(line, b2j.b[slot[0]]) {
115-
return slot
116-
}
122+
// newB2J builds the "line content -> indices in b" map in a single pass
// over b.
//
// isJunk may be nil. Otherwise it is called the first time a given line
// content is seen, and a match is recorded as a [lineno, lineJUNK] slot.
// When autoJunk is set and b has at least 200 lines, a non-junk line that
// occurs more than len(b)/100 + 1 times is collapsed into a
// [first index, linePOPULAR] slot. In every other case the threshold is
// len(b), which no slot reaches while lines remain to be processed.
func newB2J (b [][]byte, isJunk func([]byte) bool, autoJunk bool) *B2J {
123+
b2j := B2J{store: map[lineHash] [][]int{}, b: b}
124+
ntest := len(b)
125+
if autoJunk && ntest >= 200 {
126+
ntest = ntest/100 + 1
117127
}
118-
return []int{}
119-
}
120-
121-
func (b2j *B2J) delete(line []byte) {
122-
h := _hash(line)
123-
slots := b2j.store[h]
124-
for slotIndex, slot := range slots {
125-
if bytes.Equal(line, b2j.b[slot[0]]) {
126-
// Remove the whole slot from the list of slots
127-
b2j.store[h] = append(slots[:slotIndex], slots[slotIndex+1:]...)
128-
return
128+
for lineno, line := range b {
129+
h, slotIndex, slot, lt := b2j._find(&line)
130+
switch lt {
131+
case lineNORMAL:
132+
// The slot already holds ntest indices and one more occurrence just
// arrived: replace the index list by the popular marker.
if len(slot) >= ntest {
133+
b2j.store[h][slotIndex] = []int{slot[0], int(linePOPULAR)}
134+
} else {
135+
b2j.store[h][slotIndex] = append(slot, lineno)
136+
}
137+
case lineNONE:
138+
// First occurrence of this content: decide once whether it is junk.
if isJunk != nil && isJunk(line) {
139+
b2j.store[h] = append(b2j.store[h], []int{lineno, int(lineJUNK)})
140+
} else {
141+
b2j.store[h] = append(b2j.store[h], []int{lineno})
142+
}
143+
// Junk or popular slot: further occurrences are not recorded.
default:
129144
}
130145
}
146+
return &b2j
131147
}
132148

133-
func (b2j *B2J) deleteHash(h lineHash) {
134-
delete(b2j.store, h)
149+
// get returns the indices in b where line occurs.
//
// Only lineNORMAL slots are returned; lines that are junk, popular or absent
// from b yield an empty, non-nil slice. The returned slice is the stored slot
// itself, not a copy.
func (b2j *B2J) get(line []byte) []int {
150+
_, _, slot, lt := b2j._find(&line)
151+
if lt == lineNORMAL {
152+
return slot
153+
}
154+
return []int{}
135155
}
136156

137-
func (b2j *B2J) iter(hook func([]byte, []int)) {
138-
for _, slots := range b2j.store {
139-
for _, slot := range slots {
140-
hook(b2j.b[slot[0]], slot)
141-
}
142-
}
157+
// isBJunk reports whether line has a slot marked lineJUNK. Popular lines,
// normal lines and lines without a slot all report false.
func (b2j *B2J) isBJunk(line []byte) bool {
158+
_, _, _, lt := b2j._find(&line)
159+
return lt == lineJUNK
143160
}
144161

145162
// SequenceMatcher compares sequence of strings. The basic
@@ -174,10 +191,8 @@ type SequenceMatcher struct {
174191
b2j B2J
175192
IsJunk func([]byte) bool
176193
autoJunk bool
177-
bJunk map[lineHash]struct{}
178194
matchingBlocks []Match
179195
fullBCount map[lineHash]int
180-
bPopular []int
181196
opCodes []OpCode
182197
}
183198

@@ -234,45 +249,10 @@ func (m *SequenceMatcher) SetSeq2(b [][]byte) {
234249

235250
// chainB rebuilds m.b2j from m.b. Junk and popular detection is delegated to
// newB2J, which receives m.IsJunk and m.autoJunk.
func (m *SequenceMatcher) chainB() {
236251
// Populate line -> index mapping
237-
b2j := *newB2J(m.b)
238-
239-
// Purge junk elements
240-
m.bJunk = map[lineHash]struct{}{}
241-
if m.IsJunk != nil {
242-
junk := m.bJunk
243-
b2j.iter(func (s []byte, _ []int){
244-
if m.IsJunk(s) {
245-
junk[_hash(s)] = struct{}{}
246-
}
247-
})
248-
for h, _ := range junk {
249-
b2j.deleteHash(h)
250-
}
251-
}
252-
253-
// Purge remaining popular elements
254-
popular := []int{}
255-
n := len(m.b)
256-
if m.autoJunk && n >= 200 {
257-
ntest := n/100 + 1
258-
b2j.iter(func (s []byte, indices []int){
259-
if len(indices) > ntest {
260-
popular = append(popular, indices[0])
261-
}
262-
})
263-
for _, i := range popular {
264-
b2j.delete(m.b[i])
265-
}
266-
}
267-
m.bPopular = popular
252+
// newB2J returns a pointer; the B2J value it points to is copied into m.
b2j := *newB2J(m.b, m.IsJunk, m.autoJunk)
268253
m.b2j = b2j
269254
}
270255

271-
func (m *SequenceMatcher) isBJunk(s []byte) bool {
272-
_, ok := m.bJunk[_hash(s)]
273-
return ok
274-
}
275-
276256
// Find longest matching block in a[alo:ahi] and b[blo:bhi].
277257
//
278258
// If IsJunk is not defined:
@@ -340,12 +320,12 @@ func (m *SequenceMatcher) findLongestMatch(alo, ahi, blo, bhi int) Match {
340320
// "popular" non-junk elements aren't in b2j, which greatly speeds
341321
// the inner loop above, but also means "the best" match so far
342322
// doesn't contain any junk *or* popular non-junk elements.
343-
for besti > alo && bestj > blo && !m.isBJunk(m.b[bestj-1]) &&
323+
for besti > alo && bestj > blo && !m.b2j.isBJunk(m.b[bestj-1]) &&
344324
bytes.Equal(m.a[besti-1], m.b[bestj-1]) {
345325
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
346326
}
347327
for besti+bestsize < ahi && bestj+bestsize < bhi &&
348-
!m.isBJunk(m.b[bestj+bestsize]) &&
328+
!m.b2j.isBJunk(m.b[bestj+bestsize]) &&
349329
bytes.Equal(m.a[besti+bestsize], m.b[bestj+bestsize]) {
350330
bestsize += 1
351331
}
@@ -357,12 +337,12 @@ func (m *SequenceMatcher) findLongestMatch(alo, ahi, blo, bhi int) Match {
357337
// figuring out what to do with it. In the case of an empty
358338
// interesting match, this is clearly the right thing to do,
359339
// because no other kind of match is possible in the regions.
360-
for besti > alo && bestj > blo && m.isBJunk(m.b[bestj-1]) &&
340+
for besti > alo && bestj > blo && m.b2j.isBJunk(m.b[bestj-1]) &&
361341
bytes.Equal(m.a[besti-1], m.b[bestj-1]) {
362342
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
363343
}
364344
for besti+bestsize < ahi && bestj+bestsize < bhi &&
365-
m.isBJunk(m.b[bestj+bestsize]) &&
345+
m.b2j.isBJunk(m.b[bestj+bestsize]) &&
366346
bytes.Equal(m.a[besti+bestsize], m.b[bestj+bestsize]) {
367347
bestsize += 1
368348
}

difflib/bytes/bytes_test.go

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"reflect"
88
"strings"
99
"testing"
10+
"sort"
1011
)
1112

1213
func assertAlmostEqual(t *testing.T, a, b float64, places int) {
@@ -234,20 +235,41 @@ func rep(s string, count int) string {
234235
return strings.Repeat(s, count)
235236
}
236237

238+
// getall is a test helper returning the lines of b2j.b whose slot has type
// lt, one line per slot, ordered by the first index stored in the slot. The
// result is non-nil even when no slot matches.
func getall(b2j *B2J, lt lineType) [][]byte {
239+
result := []int{}
240+
for _, slots := range b2j.store {
241+
for _, slot := range slots {
242+
// A two-element slot with a negative second element carries a
// lineType marker; any other slot is a plain index list.
slt := lineNORMAL
243+
if len(slot) == 2 && slot[1] < 0 {
244+
slt = lineType(slot[1])
245+
}
246+
if lt == slt {
247+
result = append(result, slot[0])
248+
}
249+
}
250+
}
251+
// Map iteration order is unspecified; sort so callers can compare results.
sort.Ints(result)
252+
lines := make([][]byte, len(result))
253+
for i, lineno := range result {
254+
lines[i] = b2j.b[lineno]
255+
}
256+
return lines
257+
}
258+
237259
// TestWithAsciiOneInsert checks the ratio and opcodes when b equals a plus a
// single inserted element, first at the start and then in the middle, and
// that no slot of b2j is marked popular in either case.
func TestWithAsciiOneInsert(t *testing.T) {
238260
sm := NewMatcher(splitChars(rep("b", 100)),
239261
splitChars("a"+rep("b", 100)))
240262
assertAlmostEqual(t, sm.Ratio(), 0.995, 3)
241263
assertEqual(t, sm.GetOpCodes(),
242264
[]OpCode{{'i', 0, 0, 0, 1}, {'e', 0, 100, 1, 101}})
243-
assertEqual(t, len(sm.bPopular), 0)
265+
assertEqual(t, len(getall(&sm.b2j, linePOPULAR)), 0)
244266

245267
// Same sequences, with the insertion in the middle of b.
sm = NewMatcher(splitChars(rep("b", 100)),
246268
splitChars(rep("b", 50)+"a"+rep("b", 50)))
247269
assertAlmostEqual(t, sm.Ratio(), 0.995, 3)
248270
assertEqual(t, sm.GetOpCodes(),
249271
[]OpCode{{'e', 0, 50, 0, 50}, {'i', 50, 50, 50, 51}, {'e', 50, 100, 51, 101}})
250-
assertEqual(t, len(sm.bPopular), 0)
272+
assertEqual(t, len(getall(&sm.b2j, linePOPULAR)), 0)
251273
}
252274

253275
func TestWithAsciiOnDelete(t *testing.T) {
@@ -264,18 +286,18 @@ func TestWithAsciiBJunk(t *testing.T) {
264286
}
265287
sm := NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
266288
splitChars(rep("a", 44)+rep("b", 40)), true, isJunk)
267-
assertEqual(t, sm.bJunk, map[lineHash]struct{}{})
289+
assertEqual(t, getall(&sm.b2j, lineJUNK), [][]byte{})
268290

269291
sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
270292
splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)
271-
assertEqual(t, sm.bJunk, map[lineHash]struct{}{_hash(SPACE): struct{}{}})
293+
assertEqual(t, getall(&sm.b2j, lineJUNK), [][]byte{SPACE})
272294

273295
isJunk = func(s []byte) bool {
274296
return len(s) == 1 && (s[0] == ' ' || s[0] == 'b')
275297
}
276298
sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
277299
splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)
278-
assertEqual(t, sm.bJunk, map[lineHash]struct{}{_hash(SPACE): struct{}{}, _hash([]byte{'b'}): struct{}{}})
300+
assertEqual(t, getall(&sm.b2j, lineJUNK), [][]byte{[]byte{'b'}, SPACE})
279301
}
280302

281303
func TestSFBugsRatioForNullSeqn(t *testing.T) {

difflib/difflib.go

Lines changed: 21 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -100,10 +100,10 @@ type SequenceMatcher struct {
100100
b2j map[string][]int
101101
IsJunk func(string) bool
102102
autoJunk bool
103-
bJunk map[string]struct{}
103+
bJunk map[string]bool
104104
matchingBlocks []Match
105105
fullBCount map[string]int
106-
bPopular map[string]struct{}
106+
bPopular map[string]bool
107107
opCodes []OpCode
108108
}
109109

@@ -161,42 +161,31 @@ func (m *SequenceMatcher) SetSeq2(b []string) {
161161
// chainB populates m.b2j, m.bJunk and m.bPopular in a single pass over m.b.
//
// Strings for which m.IsJunk returns true go into junk and never enter b2j.
// A non-junk string whose index list would grow past ntest is removed from
// b2j and recorded in popular. ntest is len(m.b), unless m.autoJunk is set
// and m.b has at least 200 elements, in which case it is len(m.b)/100 + 1.
func (m *SequenceMatcher) chainB() {
162162
// Populate line -> index mapping
163163
b2j := map[string][]int{}
164-
for i, s := range m.b {
165-
indices := b2j[s]
166-
indices = append(indices, i)
167-
b2j[s] = indices
164+
junk := map[string]bool{}
165+
popular := map[string]bool{}
166+
ntest := len(m.b)
167+
if m.autoJunk && ntest >= 200 {
168+
ntest = ntest/100 + 1
168169
}
169-
170-
// Purge junk elements
171-
m.bJunk = map[string]struct{}{}
172-
if m.IsJunk != nil {
173-
junk := m.bJunk
174-
for s, _ := range b2j {
175-
if m.IsJunk(s) {
176-
junk[s] = struct{}{}
170+
for i, s := range m.b {
171+
if !junk[s] {
172+
// NOTE(review): only positive answers are cached in junk, so IsJunk
// is called again for every occurrence of a non-junk string.
if m.IsJunk != nil && m.IsJunk(s) {
173+
junk[s] = true
174+
} else if !popular[s] {
175+
ids := append(b2j[s], i)
176+
if len(ids) <= ntest {
177+
b2j[s] = ids
178+
} else {
179+
// Too many occurrences: drop the index list and remember the
// string as popular so later occurrences are skipped.
delete(b2j, s)
180+
popular[s] = true
181+
}
177182
}
178183
}
179-
for s, _ := range junk {
180-
delete(b2j, s)
181-
}
182184
}
183185

184-
// Purge remaining popular elements
185-
popular := map[string]struct{}{}
186-
n := len(m.b)
187-
if m.autoJunk && n >= 200 {
188-
ntest := n/100 + 1
189-
for s, indices := range b2j {
190-
if len(indices) > ntest {
191-
popular[s] = struct{}{}
192-
}
193-
}
194-
for s, _ := range popular {
195-
delete(b2j, s)
196-
}
197-
}
198-
m.bPopular = popular
199186
m.b2j = b2j
187+
m.bJunk = junk
188+
m.bPopular = popular
200189
}
201190

202191
func (m *SequenceMatcher) isBJunk(s string) bool {

difflib/difflib_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -248,18 +248,18 @@ func TestWithAsciiBJunk(t *testing.T) {
248248
}
249249
sm := NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
250250
splitChars(rep("a", 44)+rep("b", 40)), true, isJunk)
251-
assertEqual(t, sm.bJunk, map[string]struct{}{})
251+
assertEqual(t, sm.bJunk, map[string]bool{})
252252

253253
sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
254254
splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)
255-
assertEqual(t, sm.bJunk, map[string]struct{}{" ": struct{}{}})
255+
assertEqual(t, sm.bJunk, map[string]bool{" ": true})
256256

257257
isJunk = func(s string) bool {
258258
return s == " " || s == "b"
259259
}
260260
sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
261261
splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)
262-
assertEqual(t, sm.bJunk, map[string]struct{}{" ": struct{}{}, "b": struct{}{}})
262+
assertEqual(t, sm.bJunk, map[string]bool{" ": true, "b": true})
263263
}
264264

265265
func TestSFBugsRatioForNullSeqn(t *testing.T) {

0 commit comments

Comments
 (0)