Skip to content

Improve population of the "line contents -> indices in b" map #7

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Nov 9, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 64 additions & 84 deletions difflib/bytes/bytes.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ func calculateRatio(matches, length int) float64 {

func listifyString(str []byte) (lst [][]byte) {
lst = make([][]byte, len(str))
for i, c := range str {
lst[i] = []byte{c}
for i := range str {
lst[i] = str[i:i+1]
}
return lst
}
Expand Down Expand Up @@ -84,62 +84,79 @@ type B2J struct {
b [][]byte
}

func newB2J (b [][]byte) *B2J {
b2j := B2J{store: map[lineHash] [][]int{}, b: b}
for lineno, line := range b {
h := _hash(line)
// lineType classifies what the B2J store knows about a given line content.
// Negative values are also written, converted to int, into the second
// element of a two-element slot to mark it as special.
type lineType int8

const (
	// lineNONE: no slot exists for the line content.
	lineNONE = lineType(0)
	// lineNORMAL: the slot holds the indices in b where the line appears.
	lineNORMAL = lineType(1)
	// lineJUNK: marker for a line content flagged by the isJunk callback.
	lineJUNK = lineType(-1)
	// linePOPULAR: marker for a line content seen too many times.
	linePOPULAR = lineType(-2)
)

func (b2j *B2J) _find(line *[]byte) (h lineHash, slotIndex int,
slot []int, lt lineType) {
h = _hash(*line)
for slotIndex, slot = range b2j.store[h] {
// Thanks to the qualities of sha1, the probability of having more than
// one line content with the same hash is very low. Nevertheless, store
// each of them in a different slot, that we can differentiate by
// looking at the line contents in the b slice.
for slotIndex, slot := range b2j.store[h] {
if bytes.Equal(line, b[slot[0]]) {
// The content already has a slot in its hash bucket. Just
// append the newly seen index to the slice in that slot
b2j.store[h][slotIndex] = append(slot, lineno)
goto cont
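// In place of all the line numbers where the line appears, a slot can
// also hold [lineno, marker] with a negative marker: lineJUNK (-1) if
// b[lineno] is junk, or linePOPULAR (-2) once the line has been seen
// too many times.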
if bytes.Equal(*line, b2j.b[slot[0]]) {
// The content already has a slot in its hash bucket.
if len(slot) == 2 && slot[1] < 0 {
lt = lineType(slot[1])
} else {
lt = lineNORMAL
}
return // every return variable has the correct value
}
// The line content still has no slot. Create one with a single value.
b2j.store[h] = append(b2j.store[h], []int{lineno})
cont:
}
return &b2j
// The line content still has no slot.
slotIndex = -1
slot = nil
lt = lineNONE
return
}

func (b2j *B2J) get(line []byte) []int {
// Thanks to the qualities of sha1, there should be very few (zero or one)
// slots, so the following loop is fast.
for _, slot := range b2j.store[_hash(line)] {
if bytes.Equal(line, b2j.b[slot[0]]) {
return slot
}
func newB2J (b [][]byte, isJunk func([]byte) bool, autoJunk bool) *B2J {
b2j := B2J{store: map[lineHash] [][]int{}, b: b}
ntest := len(b)
if autoJunk && ntest >= 200 {
ntest = ntest/100 + 1
}
return []int{}
}

func (b2j *B2J) delete(line []byte) {
h := _hash(line)
slots := b2j.store[h]
for slotIndex, slot := range slots {
if bytes.Equal(line, b2j.b[slot[0]]) {
// Remove the whole slot from the list of slots
b2j.store[h] = append(slots[:slotIndex], slots[slotIndex+1:]...)
return
for lineno, line := range b {
h, slotIndex, slot, lt := b2j._find(&line)
switch lt {
case lineNORMAL:
if len(slot) >= ntest {
b2j.store[h][slotIndex] = []int{slot[0], int(linePOPULAR)}
} else {
b2j.store[h][slotIndex] = append(slot, lineno)
}
case lineNONE:
if isJunk != nil && isJunk(line) {
b2j.store[h] = append(b2j.store[h], []int{lineno, int(lineJUNK)})
} else {
b2j.store[h] = append(b2j.store[h], []int{lineno})
}
default:
}
}
return &b2j
}

func (b2j *B2J) deleteHash(h lineHash) {
delete(b2j.store, h)
// get looks the line up through _find and returns the slot it reports when
// that slot is classified as lineNORMAL. For any other classification
// (no slot, junk, popular) it returns a fresh, empty, non-nil slice.
func (b2j *B2J) get(line []byte) []int {
	if _, _, indices, kind := b2j._find(&line); kind == lineNORMAL {
		return indices
	}
	return []int{}
}

func (b2j *B2J) iter(hook func([]byte, []int)) {
for _, slots := range b2j.store {
for _, slot := range slots {
hook(b2j.b[slot[0]], slot)
}
}
// isBJunk reports whether _find classifies the given line content as
// lineJUNK.
func (b2j *B2J) isBJunk(line []byte) bool {
	if _, _, _, kind := b2j._find(&line); kind != lineJUNK {
		return false
	}
	return true
}

// SequenceMatcher compares sequence of strings. The basic
Expand Down Expand Up @@ -174,10 +191,8 @@ type SequenceMatcher struct {
b2j B2J
IsJunk func([]byte) bool
autoJunk bool
bJunk map[lineHash]struct{}
matchingBlocks []Match
fullBCount map[lineHash]int
bPopular []int
opCodes []OpCode
}

Expand Down Expand Up @@ -234,45 +249,10 @@ func (m *SequenceMatcher) SetSeq2(b [][]byte) {

func (m *SequenceMatcher) chainB() {
// Populate line -> index mapping
b2j := *newB2J(m.b)

// Purge junk elements
m.bJunk = map[lineHash]struct{}{}
if m.IsJunk != nil {
junk := m.bJunk
b2j.iter(func (s []byte, _ []int){
if m.IsJunk(s) {
junk[_hash(s)] = struct{}{}
}
})
for h, _ := range junk {
b2j.deleteHash(h)
}
}

// Purge remaining popular elements
popular := []int{}
n := len(m.b)
if m.autoJunk && n >= 200 {
ntest := n/100 + 1
b2j.iter(func (s []byte, indices []int){
if len(indices) > ntest {
popular = append(popular, indices[0])
}
})
for _, i := range popular {
b2j.delete(m.b[i])
}
}
m.bPopular = popular
b2j := *newB2J(m.b, m.IsJunk, m.autoJunk)
m.b2j = b2j
}

func (m *SequenceMatcher) isBJunk(s []byte) bool {
_, ok := m.bJunk[_hash(s)]
return ok
}

// Find longest matching block in a[alo:ahi] and b[blo:bhi].
//
// If IsJunk is not defined:
Expand Down Expand Up @@ -340,12 +320,12 @@ func (m *SequenceMatcher) findLongestMatch(alo, ahi, blo, bhi int) Match {
// "popular" non-junk elements aren't in b2j, which greatly speeds
// the inner loop above, but also means "the best" match so far
// doesn't contain any junk *or* popular non-junk elements.
for besti > alo && bestj > blo && !m.isBJunk(m.b[bestj-1]) &&
for besti > alo && bestj > blo && !m.b2j.isBJunk(m.b[bestj-1]) &&
bytes.Equal(m.a[besti-1], m.b[bestj-1]) {
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
}
for besti+bestsize < ahi && bestj+bestsize < bhi &&
!m.isBJunk(m.b[bestj+bestsize]) &&
!m.b2j.isBJunk(m.b[bestj+bestsize]) &&
bytes.Equal(m.a[besti+bestsize], m.b[bestj+bestsize]) {
bestsize += 1
}
Expand All @@ -357,12 +337,12 @@ func (m *SequenceMatcher) findLongestMatch(alo, ahi, blo, bhi int) Match {
// figuring out what to do with it. In the case of an empty
// interesting match, this is clearly the right thing to do,
// because no other kind of match is possible in the regions.
for besti > alo && bestj > blo && m.isBJunk(m.b[bestj-1]) &&
for besti > alo && bestj > blo && m.b2j.isBJunk(m.b[bestj-1]) &&
bytes.Equal(m.a[besti-1], m.b[bestj-1]) {
besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
}
for besti+bestsize < ahi && bestj+bestsize < bhi &&
m.isBJunk(m.b[bestj+bestsize]) &&
m.b2j.isBJunk(m.b[bestj+bestsize]) &&
bytes.Equal(m.a[besti+bestsize], m.b[bestj+bestsize]) {
bestsize += 1
}
Expand Down
32 changes: 27 additions & 5 deletions difflib/bytes/bytes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"reflect"
"strings"
"testing"
"sort"
)

func assertAlmostEqual(t *testing.T, a, b float64, places int) {
Expand Down Expand Up @@ -234,20 +235,41 @@ func rep(s string, count int) string {
return strings.Repeat(s, count)
}

// getall is a test helper. It walks every slot in b2j.store, keeps the slots
// whose classification equals lt, and returns the corresponding line
// contents ordered by the index stored first in each slot.
//
// A slot counts as lineNORMAL unless it has exactly two elements and the
// second one is negative, in which case that element is its lineType.
// The result is always non-nil, even when nothing matches.
func getall(b2j *B2J, lt lineType) [][]byte {
	firstIndices := []int{}
	for _, bucket := range b2j.store {
		for _, entry := range bucket {
			kind := lineNORMAL
			if len(entry) == 2 && entry[1] < 0 {
				kind = lineType(entry[1])
			}
			if kind != lt {
				continue
			}
			firstIndices = append(firstIndices, entry[0])
		}
	}
	// Map iteration order is random; sort for a deterministic result.
	sort.Ints(firstIndices)
	lines := make([][]byte, 0, len(firstIndices))
	for _, lineno := range firstIndices {
		lines = append(lines, b2j.b[lineno])
	}
	return lines
}

func TestWithAsciiOneInsert(t *testing.T) {
sm := NewMatcher(splitChars(rep("b", 100)),
splitChars("a"+rep("b", 100)))
assertAlmostEqual(t, sm.Ratio(), 0.995, 3)
assertEqual(t, sm.GetOpCodes(),
[]OpCode{{'i', 0, 0, 0, 1}, {'e', 0, 100, 1, 101}})
assertEqual(t, len(sm.bPopular), 0)
assertEqual(t, len(getall(&sm.b2j, linePOPULAR)), 0)

sm = NewMatcher(splitChars(rep("b", 100)),
splitChars(rep("b", 50)+"a"+rep("b", 50)))
assertAlmostEqual(t, sm.Ratio(), 0.995, 3)
assertEqual(t, sm.GetOpCodes(),
[]OpCode{{'e', 0, 50, 0, 50}, {'i', 50, 50, 50, 51}, {'e', 50, 100, 51, 101}})
assertEqual(t, len(sm.bPopular), 0)
assertEqual(t, len(getall(&sm.b2j, linePOPULAR)), 0)
}

func TestWithAsciiOnDelete(t *testing.T) {
Expand All @@ -264,18 +286,18 @@ func TestWithAsciiBJunk(t *testing.T) {
}
sm := NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
splitChars(rep("a", 44)+rep("b", 40)), true, isJunk)
assertEqual(t, sm.bJunk, map[lineHash]struct{}{})
assertEqual(t, getall(&sm.b2j, lineJUNK), [][]byte{})

sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)
assertEqual(t, sm.bJunk, map[lineHash]struct{}{_hash(SPACE): struct{}{}})
assertEqual(t, getall(&sm.b2j, lineJUNK), [][]byte{SPACE})

isJunk = func(s []byte) bool {
return len(s) == 1 && (s[0] == ' ' || s[0] == 'b')
}
sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)
assertEqual(t, sm.bJunk, map[lineHash]struct{}{_hash(SPACE): struct{}{}, _hash([]byte{'b'}): struct{}{}})
assertEqual(t, getall(&sm.b2j, lineJUNK), [][]byte{[]byte{'b'}, SPACE})
}

func TestSFBugsRatioForNullSeqn(t *testing.T) {
Expand Down
53 changes: 21 additions & 32 deletions difflib/difflib.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,10 @@ type SequenceMatcher struct {
b2j map[string][]int
IsJunk func(string) bool
autoJunk bool
bJunk map[string]struct{}
bJunk map[string]bool
matchingBlocks []Match
fullBCount map[string]int
bPopular map[string]struct{}
bPopular map[string]bool
opCodes []OpCode
}

Expand Down Expand Up @@ -161,42 +161,31 @@ func (m *SequenceMatcher) SetSeq2(b []string) {
func (m *SequenceMatcher) chainB() {
// Populate line -> index mapping
b2j := map[string][]int{}
for i, s := range m.b {
indices := b2j[s]
indices = append(indices, i)
b2j[s] = indices
junk := map[string]bool{}
popular := map[string]bool{}
ntest := len(m.b)
if m.autoJunk && ntest >= 200 {
ntest = ntest/100 + 1
}

// Purge junk elements
m.bJunk = map[string]struct{}{}
if m.IsJunk != nil {
junk := m.bJunk
for s, _ := range b2j {
if m.IsJunk(s) {
junk[s] = struct{}{}
for i, s := range m.b {
if !junk[s] {
if m.IsJunk != nil && m.IsJunk(s) {
junk[s] = true
} else if !popular[s] {
ids := append(b2j[s], i)
if len(ids) <= ntest {
b2j[s] = ids
} else {
delete(b2j, s)
popular[s] = true
}
}
}
for s, _ := range junk {
delete(b2j, s)
}
}

// Purge remaining popular elements
popular := map[string]struct{}{}
n := len(m.b)
if m.autoJunk && n >= 200 {
ntest := n/100 + 1
for s, indices := range b2j {
if len(indices) > ntest {
popular[s] = struct{}{}
}
}
for s, _ := range popular {
delete(b2j, s)
}
}
m.bPopular = popular
m.b2j = b2j
m.bJunk = junk
m.bPopular = popular
}

func (m *SequenceMatcher) isBJunk(s string) bool {
Expand Down
6 changes: 3 additions & 3 deletions difflib/difflib_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -248,18 +248,18 @@ func TestWithAsciiBJunk(t *testing.T) {
}
sm := NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
splitChars(rep("a", 44)+rep("b", 40)), true, isJunk)
assertEqual(t, sm.bJunk, map[string]struct{}{})
assertEqual(t, sm.bJunk, map[string]bool{})

sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)
assertEqual(t, sm.bJunk, map[string]struct{}{" ": struct{}{}})
assertEqual(t, sm.bJunk, map[string]bool{" ": true})

isJunk = func(s string) bool {
return s == " " || s == "b"
}
sm = NewMatcherWithJunk(splitChars(rep("a", 40)+rep("b", 40)),
splitChars(rep("a", 44)+rep("b", 40)+rep(" ", 20)), false, isJunk)
assertEqual(t, sm.bJunk, map[string]struct{}{" ": struct{}{}, "b": struct{}{}})
assertEqual(t, sm.bJunk, map[string]bool{" ": true, "b": true})
}

func TestSFBugsRatioForNullSeqn(t *testing.T) {
Expand Down