@@ -23,6 +23,8 @@ import (
23
23
"io"
24
24
"strings"
25
25
"unicode"
26
+ "crypto/sha1"
27
+ "encoding/binary"
26
28
)
27
29
28
30
func min (a , b int ) int {
@@ -68,6 +70,75 @@ type OpCode struct {
68
70
J2 int
69
71
}
70
72
73
// B2J maps a line's content to the indices at which that content occurs in
// the slice b. Unlike a plain map[string][]int it does not store copies of
// the lines as keys: entries are keyed by a 32-bit hash of the line, and the
// content is recovered by looking at b, so B2J must keep a reference to the
// underlying slice of lines.
//
// Each hash bucket holds a list of "slots". A slot is the list of indices (in
// increasing order) of one distinct line content; the content of a slot is
// b[slot[0]]. Distinct contents whose hashes collide get distinct slots in
// the same bucket.
type B2J struct {
	store map[int32][][]int
	b     []string
}

// _hash returns a 32-bit hash of line, taken from the first four bytes of its
// SHA-1 digest.
//
// The digest is decoded as a fixed-width integer on purpose: a varint decode
// stops at the first byte whose high bit is clear, which for digest bytes
// means a large share of lines would be hashed from a single byte and pile up
// in a handful of buckets.
func _hash(line string) int32 {
	sum := sha1.Sum([]byte(line))
	return int32(binary.BigEndian.Uint32(sum[:4]))
}

// newB2J builds the content -> indices mapping for the lines in b. The
// returned B2J keeps a reference to b; it does not copy it.
func newB2J(b []string) *B2J {
	b2j := &B2J{store: make(map[int32][][]int), b: b}
	for lineno, line := range b {
		h := _hash(line)
		slots := b2j.store[h]
		// Hash collisions between different contents are unlikely but
		// possible, so compare against the actual content of each slot.
		found := false
		for i, slot := range slots {
			if line == b[slot[0]] {
				// The content already has a slot in its bucket: record the
				// newly seen index there and stop looking, so that every
				// distinct content ends up with exactly one slot.
				slots[i] = append(slot, lineno)
				found = true
				break
			}
		}
		if !found {
			// First occurrence of this content: open a new slot for it.
			b2j.store[h] = append(slots, []int{lineno})
		}
	}
	return b2j
}

// get returns the indices in b at which line occurs, or nil if line is not
// present (or has been deleted). The returned slice is the internal slot, not
// a copy, so callers must not modify it.
func (b2j *B2J) get(line string) []int {
	for _, slot := range b2j.store[_hash(line)] {
		if line == b2j.b[slot[0]] {
			return slot
		}
	}
	return nil
}

// delete removes line and all of its indices from the mapping. Deleting a
// line that is not present is a no-op.
func (b2j *B2J) delete(line string) {
	h := _hash(line)
	slots, ok := b2j.store[h]
	if !ok {
		return
	}
	// Filter the bucket in place, dropping every slot that holds this
	// content rather than only the first one found.
	kept := slots[:0]
	for _, slot := range slots {
		if line != b2j.b[slot[0]] {
			kept = append(kept, slot)
		}
	}
	if len(kept) == 0 {
		// Do not leave an empty bucket behind in the map.
		delete(b2j.store, h)
		return
	}
	// Clear the now-unused tail so the removed slots can be collected.
	for i := len(kept); i < len(slots); i++ {
		slots[i] = nil
	}
	b2j.store[h] = kept
}

// iter calls hook once for every slot in the mapping, passing the slot's line
// content and its indices. The visiting order is unspecified because it
// follows Go's map iteration order. hook must not modify the B2J or the
// indices slice it is given.
func (b2j *B2J) iter(hook func(string, []int)) {
	for _, slots := range b2j.store {
		for _, slot := range slots {
			hook(b2j.b[slot[0]], slot)
		}
	}
}
141
+
71
142
// SequenceMatcher compares sequences of strings. The basic
72
143
// algorithm predates, and is a little fancier than, an algorithm
73
144
// published in the late 1980's by Ratcliff and Obershelp under the
@@ -97,7 +168,7 @@ type OpCode struct {
97
168
type SequenceMatcher struct {
98
169
a []string
99
170
b []string
100
- b2j map [ string ][] int
171
+ b2j B2J
101
172
IsJunk func (string ) bool
102
173
autoJunk bool
103
174
bJunk map [string ]struct {}
@@ -160,24 +231,19 @@ func (m *SequenceMatcher) SetSeq2(b []string) {
160
231
161
232
func (m * SequenceMatcher ) chainB () {
162
233
// Populate line -> index mapping
163
- b2j := map [string ][]int {}
164
- for i , s := range m .b {
165
- indices := b2j [s ]
166
- indices = append (indices , i )
167
- b2j [s ] = indices
168
- }
234
+ b2j := * newB2J (m .b )
169
235
170
236
// Purge junk elements
171
237
m .bJunk = map [string ]struct {}{}
172
238
if m .IsJunk != nil {
173
239
junk := m .bJunk
174
- for s , _ := range b2j {
240
+ b2j . iter ( func ( s string , _ [] int ) {
175
241
if m .IsJunk (s ) {
176
242
junk [s ] = struct {}{}
177
243
}
178
- }
244
+ })
179
245
for s , _ := range junk {
180
- delete (b2j , s )
246
+ b2j . delete (s )
181
247
}
182
248
}
183
249
@@ -186,13 +252,13 @@ func (m *SequenceMatcher) chainB() {
186
252
n := len (m .b )
187
253
if m .autoJunk && n >= 200 {
188
254
ntest := n / 100 + 1
189
- for s , indices := range b2j {
255
+ b2j . iter ( func ( s string , indices [] int ) {
190
256
if len (indices ) > ntest {
191
257
popular [s ] = struct {}{}
192
258
}
193
- }
259
+ })
194
260
for s , _ := range popular {
195
- delete (b2j , s )
261
+ b2j . delete (s )
196
262
}
197
263
}
198
264
m .bPopular = popular
@@ -250,7 +316,7 @@ func (m *SequenceMatcher) findLongestMatch(alo, ahi, blo, bhi int) Match {
250
316
// look at all instances of a[i] in b; note that because
251
317
// b2j has no junk keys, the loop is skipped if a[i] is junk
252
318
newj2len := map [int ]int {}
253
- for _ , j := range m.b2j [ m.a [i ]] {
319
+ for _ , j := range m .b2j . get ( m .a [i ]) {
254
320
// a[i] matches b[j]
255
321
if j < blo {
256
322
continue
0 commit comments