@@ -49,8 +49,8 @@ func calculateRatio(matches, length int) float64 {
49
49
50
50
// listifyString splits str into one single-byte slice per input byte.
//
// The returned elements are views into str, not copies, so no per-byte
// allocation takes place. Each view is built with a full slice expression
// that caps its capacity at one byte: appending to an element therefore
// reallocates instead of silently overwriting the bytes that follow it in
// str. Writing through an element (lst[i][0] = x) still modifies str.
func listifyString(str []byte) (lst [][]byte) {
	lst = make([][]byte, len(str))
	for i := range str {
		lst[i] = str[i : i+1 : i+1]
	}
	return lst
}
@@ -84,62 +84,79 @@ type B2J struct {
84
84
b [][]byte
85
85
}
86
86
87
- func newB2J (b [][]byte ) * B2J {
88
- b2j := B2J {store : map [lineHash ] [][]int {}, b : b }
89
- for lineno , line := range b {
90
- h := _hash (line )
87
// lineType classifies a line looked up in a B2J index. The negative values
// double as markers stored in the second element of a two-element slot.
type lineType int8

const (
	// lineNONE: no slot holds the line content.
	lineNONE = lineType(0)
	// lineNORMAL: the slot lists the line numbers where the content appears.
	lineNORMAL = lineType(1)
	// lineJUNK: the slot is [lineno, -1]; the content was reported as junk.
	lineJUNK = lineType(-1)
	// linePOPULAR: the slot is [lineno, -2]; the content occurred too often
	// and its line numbers were dropped.
	linePOPULAR = lineType(-2)
)
94
+
95
+ func (b2j * B2J ) _find (line * []byte ) (h lineHash , slotIndex int ,
96
+ slot []int , lt lineType ) {
97
+ h = _hash (* line )
98
+ for slotIndex , slot = range b2j .store [h ] {
91
99
// Thanks to the qualities of sha1, the probability of having more than
92
100
// one line content with the same hash is very low. Nevertheless, store
93
101
// each of them in a different slot, that we can differentiate by
94
102
// looking at the line contents in the b slice.
95
- for slotIndex , slot := range b2j .store [h ] {
96
- if bytes .Equal (line , b [slot [0 ]]) {
97
- // The content already has a slot in its hash bucket. Just
98
- // append the newly seen index to the slice in that slot
99
- b2j.store [h ][slotIndex ] = append (slot , lineno )
100
- goto cont
103
+ // In place of all the line numbers where the line appears, a slot can
104
+ // also contain [lineno, -1] if b[lineno] is junk.
105
+ if bytes .Equal (* line , b2j .b [slot [0 ]]) {
106
+ // The content already has a slot in its hash bucket.
107
+ if len (slot ) == 2 && slot [1 ] < 0 {
108
+ lt = lineType (slot [1 ])
109
+ } else {
110
+ lt = lineNORMAL
101
111
}
112
+ return // every return variable has the correct value
102
113
}
103
- // The line content still has no slot. Create one with a single value.
104
- b2j .store [h ] = append (b2j .store [h ], []int {lineno })
105
- cont:
106
114
}
107
- return & b2j
115
+ // The line content still has no slot.
116
+ slotIndex = - 1
117
+ slot = nil
118
+ lt = lineNONE
119
+ return
108
120
}
109
121
110
- func (b2j * B2J ) get (line []byte ) []int {
111
- // Thanks to the qualities of sha1, there should be very few (zero or one)
112
- // slots, so the following loop is fast.
113
- for _ , slot := range b2j .store [_hash (line )] {
114
- if bytes .Equal (line , b2j .b [slot [0 ]]) {
115
- return slot
116
- }
122
+ func newB2J (b [][]byte , isJunk func ([]byte ) bool , autoJunk bool ) * B2J {
123
+ b2j := B2J {store : map [lineHash ] [][]int {}, b : b }
124
+ ntest := len (b )
125
+ if autoJunk && ntest >= 200 {
126
+ ntest = ntest / 100 + 1
117
127
}
118
- return []int {}
119
- }
120
-
121
- func (b2j * B2J ) delete (line []byte ) {
122
- h := _hash (line )
123
- slots := b2j .store [h ]
124
- for slotIndex , slot := range slots {
125
- if bytes .Equal (line , b2j .b [slot [0 ]]) {
126
- // Remove the whole slot from the list of slots
127
- b2j .store [h ] = append (slots [:slotIndex ], slots [slotIndex + 1 :]... )
128
- return
128
+ for lineno , line := range b {
129
+ h , slotIndex , slot , lt := b2j ._find (& line )
130
+ switch lt {
131
+ case lineNORMAL :
132
+ if len (slot ) >= ntest {
133
+ b2j.store [h ][slotIndex ] = []int {slot [0 ], int (linePOPULAR )}
134
+ } else {
135
+ b2j.store [h ][slotIndex ] = append (slot , lineno )
136
+ }
137
+ case lineNONE :
138
+ if isJunk != nil && isJunk (line ) {
139
+ b2j .store [h ] = append (b2j .store [h ], []int {lineno , int (lineJUNK )})
140
+ } else {
141
+ b2j .store [h ] = append (b2j .store [h ], []int {lineno })
142
+ }
143
+ default :
129
144
}
130
145
}
146
+ return & b2j
131
147
}
132
148
133
- func (b2j * B2J ) deleteHash (h lineHash ) {
134
- delete (b2j .store , h )
149
+ func (b2j * B2J ) get (line []byte ) []int {
150
+ _ , _ , slot , lt := b2j ._find (& line )
151
+ if lt == lineNORMAL {
152
+ return slot
153
+ }
154
+ return []int {}
135
155
}
136
156
137
- func (b2j * B2J ) iter (hook func ([]byte , []int )) {
138
- for _ , slots := range b2j .store {
139
- for _ , slot := range slots {
140
- hook (b2j .b [slot [0 ]], slot )
141
- }
142
- }
157
+ func (b2j * B2J ) isBJunk (line []byte ) bool {
158
+ _ , _ , _ , lt := b2j ._find (& line )
159
+ return lt == lineJUNK
143
160
}
144
161
145
162
// SequenceMatcher compares sequence of strings. The basic
@@ -174,10 +191,8 @@ type SequenceMatcher struct {
174
191
b2j B2J
175
192
IsJunk func ([]byte ) bool
176
193
autoJunk bool
177
- bJunk map [lineHash ]struct {}
178
194
matchingBlocks []Match
179
195
fullBCount map [lineHash ]int
180
- bPopular []int
181
196
opCodes []OpCode
182
197
}
183
198
@@ -234,45 +249,10 @@ func (m *SequenceMatcher) SetSeq2(b [][]byte) {
234
249
235
250
func (m * SequenceMatcher ) chainB () {
236
251
// Populate line -> index mapping
237
- b2j := * newB2J (m .b )
238
-
239
- // Purge junk elements
240
- m .bJunk = map [lineHash ]struct {}{}
241
- if m .IsJunk != nil {
242
- junk := m .bJunk
243
- b2j .iter (func (s []byte , _ []int ){
244
- if m .IsJunk (s ) {
245
- junk [_hash (s )] = struct {}{}
246
- }
247
- })
248
- for h , _ := range junk {
249
- b2j .deleteHash (h )
250
- }
251
- }
252
-
253
- // Purge remaining popular elements
254
- popular := []int {}
255
- n := len (m .b )
256
- if m .autoJunk && n >= 200 {
257
- ntest := n / 100 + 1
258
- b2j .iter (func (s []byte , indices []int ){
259
- if len (indices ) > ntest {
260
- popular = append (popular , indices [0 ])
261
- }
262
- })
263
- for _ , i := range popular {
264
- b2j .delete (m .b [i ])
265
- }
266
- }
267
- m .bPopular = popular
252
+ b2j := * newB2J (m .b , m .IsJunk , m .autoJunk )
268
253
m .b2j = b2j
269
254
}
270
255
271
- func (m * SequenceMatcher ) isBJunk (s []byte ) bool {
272
- _ , ok := m .bJunk [_hash (s )]
273
- return ok
274
- }
275
-
276
256
// Find longest matching block in a[alo:ahi] and b[blo:bhi].
277
257
//
278
258
// If IsJunk is not defined:
@@ -340,12 +320,12 @@ func (m *SequenceMatcher) findLongestMatch(alo, ahi, blo, bhi int) Match {
340
320
// "popular" non-junk elements aren't in b2j, which greatly speeds
341
321
// the inner loop above, but also means "the best" match so far
342
322
// doesn't contain any junk *or* popular non-junk elements.
343
- for besti > alo && bestj > blo && ! m .isBJunk (m .b [bestj - 1 ]) &&
323
+ for besti > alo && bestj > blo && ! m .b2j . isBJunk (m .b [bestj - 1 ]) &&
344
324
bytes .Equal (m .a [besti - 1 ], m .b [bestj - 1 ]) {
345
325
besti , bestj , bestsize = besti - 1 , bestj - 1 , bestsize + 1
346
326
}
347
327
for besti + bestsize < ahi && bestj + bestsize < bhi &&
348
- ! m .isBJunk (m .b [bestj + bestsize ]) &&
328
+ ! m .b2j . isBJunk (m .b [bestj + bestsize ]) &&
349
329
bytes .Equal (m .a [besti + bestsize ], m .b [bestj + bestsize ]) {
350
330
bestsize += 1
351
331
}
@@ -357,12 +337,12 @@ func (m *SequenceMatcher) findLongestMatch(alo, ahi, blo, bhi int) Match {
357
337
// figuring out what to do with it. In the case of an empty
358
338
// interesting match, this is clearly the right thing to do,
359
339
// because no other kind of match is possible in the regions.
360
- for besti > alo && bestj > blo && m .isBJunk (m .b [bestj - 1 ]) &&
340
+ for besti > alo && bestj > blo && m .b2j . isBJunk (m .b [bestj - 1 ]) &&
361
341
bytes .Equal (m .a [besti - 1 ], m .b [bestj - 1 ]) {
362
342
besti , bestj , bestsize = besti - 1 , bestj - 1 , bestsize + 1
363
343
}
364
344
for besti + bestsize < ahi && bestj + bestsize < bhi &&
365
- m .isBJunk (m .b [bestj + bestsize ]) &&
345
+ m .b2j . isBJunk (m .b [bestj + bestsize ]) &&
366
346
bytes .Equal (m .a [besti + bestsize ], m .b [bestj + bestsize ]) {
367
347
bestsize += 1
368
348
}
0 commit comments